main compiled + tests passed
This commit is contained in:
parent
0ac6024b63
commit
a6e6f7405f
3 changed files with 57 additions and 4 deletions
BIN
__pycache__/wordfreq.cpython-313.pyc
Normal file
BIN
__pycache__/wordfreq.cpython-313.pyc
Normal file
Binary file not shown.
32
topmost.py
Normal file
32
topmost.py
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
import sys
|
||||||
|
import wordfreq
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Print the most frequent words from a file or URL.

    Command-line arguments:
        1: path to the stop-word file (one stop word per line)
        2: path or URL of the word file to analyze
        3: number of top words to print
    """
    stop_word_file = sys.argv[1]
    word_file = sys.argv[2]
    amount_of_words = int(sys.argv[3])

    # Treat the source as a URL only when it starts with a scheme; the old
    # substring test ("https" in word_file) misclassified any local file
    # whose name happened to contain "https", and missed plain http:// URLs.
    if word_file.startswith(("http://", "https://")):
        response = urllib.request.urlopen(word_file)
        lines = response.read().decode("utf8").splitlines()
    else:
        # Context manager guarantees the handle is closed even on error.
        with open(word_file, encoding="utf-8") as inp_file:
            lines = inp_file.readlines()
    tokenized = wordfreq.tokenize(lines)

    with open(stop_word_file, encoding="utf-8") as stop_word_lines:
        stop_words = stop_word_lines.read().split("\n")

    counted_words = wordfreq.countWords(tokenized, stop_words)
    wordfreq.printTopMost(counted_words, amount_of_words)


main()
|
||||||
29
wordfreq.py
29
wordfreq.py
|
|
@ -5,7 +5,7 @@ class CharType(Enum):
|
||||||
NONE = 0
|
NONE = 0
|
||||||
DIGIT = 1
|
DIGIT = 1
|
||||||
ALPHA = 2
|
ALPHA = 2
|
||||||
SPACE = 3
|
WHITESPACE = 3
|
||||||
SYMBOL = 4
|
SYMBOL = 4
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -14,8 +14,8 @@ def char_type(char: str) -> CharType:
|
||||||
return CharType.DIGIT
|
return CharType.DIGIT
|
||||||
elif char.isalpha():
|
elif char.isalpha():
|
||||||
return CharType.ALPHA
|
return CharType.ALPHA
|
||||||
elif char == " ":
|
elif char == " " or char == " ":
|
||||||
return CharType.SPACE
|
return CharType.WHITESPACE
|
||||||
return CharType.SYMBOL
|
return CharType.SYMBOL
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -27,7 +27,7 @@ def tokenize(lines: list[str]) -> list[str]:
|
||||||
for i, char in enumerate(line.strip().lower()):
|
for i, char in enumerate(line.strip().lower()):
|
||||||
if len(current_word) == 0:
|
if len(current_word) == 0:
|
||||||
ct = char_type(char)
|
ct = char_type(char)
|
||||||
if ct == CharType.SPACE:
|
if ct == CharType.WHITESPACE:
|
||||||
continue
|
continue
|
||||||
if ct == CharType.SYMBOL:
|
if ct == CharType.SYMBOL:
|
||||||
result.append(char)
|
result.append(char)
|
||||||
|
|
@ -38,3 +38,24 @@ def tokenize(lines: list[str]) -> list[str]:
|
||||||
result.append("".join(current_word))
|
result.append("".join(current_word))
|
||||||
current_word = []
|
current_word = []
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def countWords(words, stopWords=None):
    """Count occurrences of each word, ignoring stop words.

    Parameters:
        words: iterable of word strings to count.
        stopWords: optional iterable of words to exclude (default: none).

    Returns:
        dict mapping each non-stop word to its occurrence count.
    """
    # None sentinel instead of a mutable default list (shared-state pitfall).
    excluded = set(stopWords) if stopWords is not None else set()
    counts = {}  # renamed from 'dict', which shadowed the builtin
    for word in words:
        if word not in excluded:
            # dict.get avoids the double lookup of `in dict.keys()` + index
            counts[word] = counts.get(word, 0) + 1
    return counts
|
||||||
|
|
||||||
|
def printTopMost(frequencies, n):
    """Print the n most frequent words, one per line.

    Parameters:
        frequencies: dict mapping word -> count.
        n: maximum number of entries to print.

    Each line is the word left-justified to 19 columns followed by the
    count right-justified to 5 columns.
    """
    # sorted() is stable, so ties keep dict insertion order, exactly as the
    # original append-then-sort loop did. 'ranked' replaces the old local
    # named 'list', which shadowed the builtin.
    ranked = sorted(frequencies.items(), key=lambda item: -item[1])
    # max(n, 0) preserves the original range(0, n) behavior for negative n
    # (print nothing) — a bare ranked[:n] would drop from the end instead.
    for word, freq in ranked[: max(n, 0)]:
        print(word.ljust(19), str(freq).rjust(5))
|
||||||
Loading…
Add table
Add a link
Reference in a new issue