This commit is contained in:
Love Lysell Berglund 2025-09-25 14:11:19 +02:00
commit 7675c0a910
3 changed files with 40 additions and 0 deletions

40
wordfreq.py Normal file
View file

@ -0,0 +1,40 @@
from enum import Enum
class CharType(Enum):
NONE = 0
DIGIT = 1
ALPHA = 2
SPACE = 3
SYMBOL = 4
def char_type(char: str) -> CharType:
if char.isdigit():
return CharType.DIGIT
elif char.isalpha():
return CharType.ALPHA
elif char == " ":
return CharType.SPACE
return CharType.SYMBOL
def tokenize(lines: list[str]) -> list[str]:
result = []
for line in lines:
word_type = CharType.NONE
current_word = []
for i, char in enumerate(line.lower()):
if len(current_word) == 0:
ct = char_type(char)
if ct == CharType.SPACE:
continue
if ct == CharType.SYMBOL:
result.append(char)
continue
word_type = ct
current_word.append(char)
if i+1 >= len(line) or word_type != char_type(line[i+1]):
result.append("".join(current_word))
current_word = []
return result