main compiled + tests passed
This commit is contained in:
parent
0ac6024b63
commit
a6e6f7405f
3 changed files with 57 additions and 4 deletions
29
wordfreq.py
29
wordfreq.py
|
|
@ -5,7 +5,7 @@ class CharType(Enum):
|
|||
NONE = 0
|
||||
DIGIT = 1
|
||||
ALPHA = 2
|
||||
SPACE = 3
|
||||
WHITESPACE = 3
|
||||
SYMBOL = 4
|
||||
|
||||
|
||||
|
|
@ -14,8 +14,8 @@ def char_type(char: str) -> CharType:
|
|||
return CharType.DIGIT
|
||||
elif char.isalpha():
|
||||
return CharType.ALPHA
|
||||
elif char == " ":
|
||||
return CharType.SPACE
|
||||
elif char == " " or char == " ":
|
||||
return CharType.WHITESPACE
|
||||
return CharType.SYMBOL
|
||||
|
||||
|
||||
|
|
@ -27,7 +27,7 @@ def tokenize(lines: list[str]) -> list[str]:
|
|||
for i, char in enumerate(line.strip().lower()):
|
||||
if len(current_word) == 0:
|
||||
ct = char_type(char)
|
||||
if ct == CharType.SPACE:
|
||||
if ct == CharType.WHITESPACE:
|
||||
continue
|
||||
if ct == CharType.SYMBOL:
|
||||
result.append(char)
|
||||
|
|
@ -38,3 +38,24 @@ def tokenize(lines: list[str]) -> list[str]:
|
|||
result.append("".join(current_word))
|
||||
current_word = []
|
||||
return result
|
||||
|
||||
|
||||
def countWords(words, stopWords=None):
    """Count how often each word occurs, skipping stop words.

    Args:
        words: Iterable of hashable tokens (typically strings) to count.
        stopWords: Optional iterable of tokens to leave out of the count.
            ``None`` (the default) excludes nothing.

    Returns:
        A plain dict mapping each counted word to its number of occurrences.
        Keys appear in order of first occurrence in ``words``.
    """
    # Build the exclusion set once so every membership test is O(1) instead
    # of a linear scan over the stop-word collection for each word.
    excluded = frozenset(stopWords) if stopWords is not None else frozenset()
    counts = {}
    for word in words:
        if word in excluded:
            continue
        counts[word] = counts.get(word, 0) + 1
    return counts
|
||||
|
||||
def printTopMost(frequencies, n):
    """Print the ``n`` most frequent words, one per line.

    Each line holds the word left-justified in a 19-character column, a
    single space, and the count right-justified in a 5-character column.

    Args:
        frequencies: Mapping of word (str) to its occurrence count.
        n: Maximum number of entries to print. If the mapping holds fewer
            than ``n`` entries, all of them are printed; a value of zero or
            less prints nothing.
    """
    # A descending sort with reverse=True stays stable, so words with equal
    # counts keep the mapping's iteration order.
    ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
    # Clamp at zero: a negative slice bound would otherwise drop entries from
    # the end instead of printing nothing.
    for word, freq in ranked[:max(n, 0)]:
        print(word.ljust(19), str(freq).rjust(5))
|
||||
Loading…
Add table
Add a link
Reference in a new issue