import sys
import urllib.request

import wordfreq


def main():
    """Print the most frequent words of a text, ignoring stop words.

    Command-line arguments (read from ``sys.argv``):
        1. Path to a UTF-8 stop-word file, one word per line.
        2. Path or http(s) URL of the UTF-8 text to analyse.
        3. Number of words to print.
    """
    if len(sys.argv) < 4:
        sys.exit(f"usage: {sys.argv[0]} STOP_WORD_FILE TEXT_FILE_OR_URL COUNT")

    stop_word_file = sys.argv[1]
    word_file = sys.argv[2]
    amount_of_words = int(sys.argv[3])

    # Decide by URL scheme prefix rather than by substring, so that plain
    # http:// URLs are fetched and local paths that merely contain "https"
    # are still read from disk.
    if word_file.startswith(("http://", "https://")):
        with urllib.request.urlopen(word_file) as response:
            lines = response.read().decode("utf-8").splitlines()
    else:
        with open(word_file, encoding="utf-8") as text_file:
            lines = text_file.readlines()
    tokenized = wordfreq.tokenize(lines)

    with open(stop_word_file, encoding="utf-8") as stop_file:
        # A set gives constant-time membership tests while counting.
        stop_words = set(stop_file.read().split("\n"))

    counted_words = wordfreq.countWords(tokenized, stop_words)

    wordfreq.printTopMost(counted_words, amount_of_words)


# Only run when executed as a script, so importing this module has no
# side effects.
if __name__ == "__main__":
    main()
def countWords(words, stopWords=None):
    """Count how often each word occurs, ignoring stop words.

    Args:
        words: Iterable of word tokens.
        stopWords: Optional collection of words to leave out of the count.
            When omitted, no word is excluded.

    Returns:
        A dict mapping each counted word to its number of occurrences, in
        the order the words were first seen.
    """
    # Build the exclusion set once so every membership test is O(1); a None
    # default avoids sharing one mutable list between calls.
    excluded = set(stopWords) if stopWords is not None else set()
    frequencies = {}
    for word in words:
        if word not in excluded:
            frequencies[word] = frequencies.get(word, 0) + 1
    return frequencies


def printTopMost(frequencies, n):
    """Print the n most frequent words and their counts, one per line.

    Each word is left-justified to 19 columns and each count is
    right-justified to 5 columns. Words with equal counts keep the order
    they have in ``frequencies``. If there are fewer than n words, all of
    them are printed; a non-positive n prints nothing.

    Args:
        frequencies: Mapping from word to its occurrence count.
        n: Maximum number of lines to print.
    """
    # sorted() is stable, also with reverse=True, so ties keep input order.
    ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
    # Clamp n so that a negative value cannot slice from the end.
    for word, freq in ranked[:max(n, 0)]:
        print(word.ljust(19), str(freq).rjust(5))