main compiled + tests passed

This commit is contained in:
Ivryxia 2025-09-25 14:54:05 +02:00
commit a6e6f7405f
3 changed files with 57 additions and 4 deletions

Binary file not shown.

32
topmost.py Normal file
View file

@ -0,0 +1,32 @@
import sys
import wordfreq
import urllib.request
def main():
    """Print the most frequent words of a text, ignoring stop words.

    Reads three command-line arguments from ``sys.argv``:

    1. path of a UTF-8 file listing stop words, one per line
    2. path of a UTF-8 text file, or an http(s) URL to download the text from
    3. how many of the most frequent words to print
    """
    stop_word_file = sys.argv[1]
    word_file = sys.argv[2]
    amount_of_words = int(sys.argv[3])

    # Only treat the argument as a URL when it really starts with a web
    # scheme; a substring test would misfire on local paths that merely
    # contain "https" and would miss plain "http://" addresses.
    if word_file.startswith(("http://", "https://")):
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(word_file) as response:
            lines = response.read().decode("utf8").splitlines()
    else:
        with open(word_file, encoding="utf-8") as inp_file:
            lines = inp_file.readlines()

    tokenized = wordfreq.tokenize(lines)

    with open(stop_word_file, encoding="utf-8") as stop_word_lines:
        stop_words = stop_word_lines.read().split("\n")

    counted_words = wordfreq.countWords(tokenized, stop_words)
    wordfreq.printTopMost(counted_words, amount_of_words)


# Run only when executed as a script, so importing this module has no
# side effects.
if __name__ == "__main__":
    main()

View file

@ -5,7 +5,7 @@ class CharType(Enum):
NONE = 0 NONE = 0
DIGIT = 1 DIGIT = 1
ALPHA = 2 ALPHA = 2
SPACE = 3 WHITESPACE = 3
SYMBOL = 4 SYMBOL = 4
@ -14,8 +14,8 @@ def char_type(char: str) -> CharType:
return CharType.DIGIT return CharType.DIGIT
elif char.isalpha(): elif char.isalpha():
return CharType.ALPHA return CharType.ALPHA
elif char == " ": elif char == " " or char == " ":
return CharType.SPACE return CharType.WHITESPACE
return CharType.SYMBOL return CharType.SYMBOL
@ -27,7 +27,7 @@ def tokenize(lines: list[str]) -> list[str]:
for i, char in enumerate(line.strip().lower()): for i, char in enumerate(line.strip().lower()):
if len(current_word) == 0: if len(current_word) == 0:
ct = char_type(char) ct = char_type(char)
if ct == CharType.SPACE: if ct == CharType.WHITESPACE:
continue continue
if ct == CharType.SYMBOL: if ct == CharType.SYMBOL:
result.append(char) result.append(char)
@ -38,3 +38,24 @@ def tokenize(lines: list[str]) -> list[str]:
result.append("".join(current_word)) result.append("".join(current_word))
current_word = [] current_word = []
return result return result
def countWords(words, stopWords=()):
    """Count how often each word occurs, leaving out stop words.

    Args:
        words: iterable of words (hashable values) to count.
        stopWords: iterable of words that must not be counted. Defaults to
            an empty, immutable tuple so no mutable default is shared
            between calls.

    Returns:
        A dict mapping every counted word to its number of occurrences,
        with keys in order of first appearance.
    """
    # Materialise the stop words once: constant-time membership tests, and
    # one-shot iterators are not exhausted by the first lookup.
    excluded = frozenset(stopWords)
    counts = {}
    for word in words:
        if word not in excluded:
            counts[word] = counts.get(word, 0) + 1
    return counts
def printTopMost(frequencies, n):
    """Print the n most frequent words, highest count first.

    Each output line holds the word left-justified to 19 characters, a
    space, and the count right-justified to 5 characters. Words with equal
    counts keep the order they have in ``frequencies``. If fewer than n
    words exist, all of them are printed; a non-positive n prints nothing.
    """
    # sorted() is stable even with reverse=True, so ties stay in dict order.
    ranked = sorted(frequencies.items(), key=lambda pair: pair[1], reverse=True)
    # Clamp at zero so a negative n yields an empty slice, not "all but last".
    for word, count in ranked[:max(n, 0)]:
        print(word.ljust(19), str(count).rjust(5))