Group-1-Lab-Word-Counting/wordfreq.py
2025-09-25 14:54:05 +02:00

61 lines
No EOL
1.3 KiB
Python

from enum import Enum
class CharType(Enum):
    """Category assigned to a single character."""

    # Placeholder meaning "no category assigned".
    NONE = 0
    # Numeric digit character.
    DIGIT = 1
    # Alphabetic character.
    ALPHA = 2
    # Blank character separating tokens.
    WHITESPACE = 3
    # Any character that fits none of the categories above.
    SYMBOL = 4
def char_type(char: str) -> CharType:
    """Classify a single character.

    Args:
        char: The character to classify.

    Returns:
        CharType.DIGIT for digits, CharType.ALPHA for letters,
        CharType.WHITESPACE for any whitespace character (space, tab,
        newline, ...), and CharType.SYMBOL for everything else.
    """
    if char.isdigit():
        return CharType.DIGIT
    if char.isalpha():
        return CharType.ALPHA
    # str.isspace() covers tabs and newlines as well as the plain space, so
    # stray whitespace is never reported as a symbol.
    if char.isspace():
        return CharType.WHITESPACE
    return CharType.SYMBOL
def tokenize(lines: list[str]) -> list[str]:
    """Split lines of text into lower-case tokens.

    Each line is stripped of surrounding whitespace and lower-cased. A run of
    consecutive letters or a run of consecutive digits forms one token, every
    symbol character is a token on its own, and whitespace is discarded.

    Args:
        lines: The text lines to tokenize.

    Returns:
        All tokens from all lines, in the order they appear.
    """
    result = []
    for line in lines:
        # Normalize once and use the same string for both iteration and
        # lookahead; indexing the raw line would be off by the amount of
        # leading whitespace removed by strip().
        text = line.strip().lower()
        word_type = CharType.NONE
        current_word = []
        for i, char in enumerate(text):
            if not current_word:
                # Start of a potential token: decide what kind it is.
                ct = char_type(char)
                if ct == CharType.WHITESPACE:
                    continue
                if ct == CharType.SYMBOL:
                    result.append(char)
                    continue
                word_type = ct
            current_word.append(char)
            # The token ends at the end of the text or when the next
            # character belongs to a different category.
            if i + 1 >= len(text) or char_type(text[i + 1]) != word_type:
                result.append("".join(current_word))
                current_word = []
    return result
def countWords(words, stopWords=None) -> dict:
    """Count how often each word occurs, ignoring stop words.

    Args:
        words: Iterable of words to count.
        stopWords: Optional collection of words to leave out of the result.
            Defaults to no stop words.

    Returns:
        A dict mapping each counted word to its number of occurrences, with
        keys in order of first appearance.
    """
    # Build the lookup set once so each membership test is O(1).
    excluded = frozenset(stopWords or ())
    counts = {}
    for word in words:
        if word not in excluded:
            counts[word] = counts.get(word, 0) + 1
    return counts
def printTopMost(frequencies, n):
    """Print the n most frequent words, most frequent first.

    Each output line holds the word left-justified to 19 columns, a space,
    and the count right-justified to 5 columns. Words with equal counts keep
    the order in which they appear in ``frequencies``. If fewer than n words
    exist, all of them are printed; a non-positive n prints nothing.

    Args:
        frequencies: Mapping from word to its count.
        n: Maximum number of entries to print.
    """
    # sorted() is stable even with reverse=True, so ties keep mapping order.
    ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
    # Clamp n so a negative value prints nothing instead of slicing from the end.
    for word, freq in ranked[:max(n, 0)]:
        print(word.ljust(19), str(freq).rjust(5))