From 7675c0a9109d6dcd7bf0a7640c46d2a0ed30c062 Mon Sep 17 00:00:00 2001 From: Love Lysell Berglund Date: Thu, 25 Sep 2025 14:11:19 +0200 Subject: [PATCH] wordfreq --- __pycache__/wordfreq.cpython-312.pyc | Bin 0 -> 2398 bytes tokenize.py | 0 wordfreq.py | 40 +++++++++++++++++++++++++++ 3 files changed, 40 insertions(+) create mode 100644 __pycache__/wordfreq.cpython-312.pyc delete mode 100644 tokenize.py create mode 100644 wordfreq.py diff --git a/__pycache__/wordfreq.cpython-312.pyc b/__pycache__/wordfreq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9345f5680b1c22cecf0a4f7186751da556412e63 GIT binary patch literal 2398 zcmahKT~8cU^xpa2W%*iQTUyjfV?w8aQiQ5$8*RU^Ku=80>n0foiSc*)mDcw~2-&Cn%S+44gt7hKtR8P;59`z0NwmuOE4a@OT z#8q=?+wGY-**|as{IWUMBh*#LsgqSfts1&*5UN`GZC5p&DLW9Rbj#{db=#b|qWDKUucsTT>=Ez?ocHk|HmnoJ#s`*|s&mX2V-HD+4{ViggTz$Qv?CWgdw z3Blwc;X*<(#e_U9X|k^jrF1$rogn?;>^bx!(TX5&d@Wdk!^whgg!*_C((laO0HRIi8!QtnC8C zI@3K2XbzQ?P_exn3Ku(ep>RVfM%(JvzF@#TeTfn;Pi>=~B~T-FTXxy)?*Xl%0?O>x zf{a1E;uTK?CgV4<}dFP0{c>b<==M9-^4+y@7^hRhI_3?(XNw3_UafniZm|n zV-;#Fr(H8Wk@l*|YXT(`=~fO>(hMAJVW<_5hG_>ZF3W&Q+X4F_mbj$HI)r6A%G5)E zil{6o^CP8^x%g(Nb@|oBSD$bP%2IgVDp_;(9?^pF!Q>0UrqZ(9yx9B%9|+3hi&oku z?w*pc!~`WN!$DSIp3LF`I*)GthR`hMfwBsjol?{#py$C_@aoeE6}bN@$HQ8wA@)}Y z-GF$r;;iI_>hTIf-J({McfE!komAJ&$_07Xsa{fyprCkZV2>Ru#6Ae@q6M5`#n=ah zo&7Lcx>k^9`GW9&b_LPoW+2Y48*%khPu_q^b`Hf=4W%3#J z$o9dDyw#(yzrw7r^tFJN{t!?BLwgGGVCGN>oPDH)tL@pyxDbI!4%5yf&>qedp+0XB zJ7~Zc&FV`!j85D1g^ZK7HHCIDS9s!X-tv5bwUr-Yvm@wUKVsx5g)y3BA|I!NPWL$& zJ*?f-8dRHJ@tjFwr)LN~209kkeF6x2J{&2=erb#r$2OZ{%h|;eDhCy><2!!;yvZN%96Z*OUBaZO8dR*>r!k@ z`(pZ0`y>5Z{MdNhxzT-kAlW+}2SV`Lh#6wTg_I z+A1mvH5XqA9$?&no9zI5IKk5!1Y#9=JG0PZ(g3`jxmPc4eT5$~jF7m-`(lp5Un6U0!hZN%Hj0X96H9aNf23^4ozL*0>V$y553@`rHIBx> zD6WNlfprLFXQI6fv2**p>(QV|SoMKs#S7w7HbD1pubip=$=;y@pk)_zSt=?##`ss% z@t4F$hHz1N9uaY5!F!HiskF%W;L1=1!LoK~n=O_0C~jUkzt(ntxPpLu@cIMyGi{rZ c+s80ITv3n^S(sVpJAcOFhp`HegzB~a0kmEGQ~&?~ literal 0 HcmV?d00001 diff --git a/tokenize.py 
# Patch metadata carried on this line of the original diff:
#   tokenize.py -- deleted (mode 100644, index e69de29..0000000)
#   wordfreq.py -- new file (mode 100644, index 0000000..c2b5016), hunk @@ -0,0 +1,40 @@
"""Split lines of text into lowercase word, number and symbol tokens."""

from enum import Enum


class CharType(Enum):
    """Character classes used to decide where one token ends and the next begins."""

    NONE = 0    # no token in progress yet
    DIGIT = 1   # str.isdigit() is true
    ALPHA = 2   # str.isalpha() is true
    SPACE = 3   # any whitespace; separates tokens and is never emitted
    SYMBOL = 4  # everything else; each symbol is its own token


def char_type(char: str) -> CharType:
    """Return the CharType of *char*.

    Digits are checked first, then letters, then whitespace; anything left
    over (punctuation, the empty string, ...) is reported as SYMBOL.
    """
    if char.isdigit():
        return CharType.DIGIT
    if char.isalpha():
        return CharType.ALPHA
    # isspace() rather than == " " so that tabs and the trailing newline of
    # lines read from a file are treated as separators, not as symbol tokens.
    if char.isspace():
        return CharType.SPACE
    return CharType.SYMBOL


def tokenize(lines: list[str]) -> list[str]:
    """Break every line into tokens and return them in order of appearance.

    Each line is lower-cased first. A token is a maximal run of letters, a
    maximal run of digits, or a single symbol character. Whitespace only
    separates tokens. Tokens never span two lines.

    Args:
        lines: The lines of text to tokenize.

    Returns:
        A flat list with the tokens of all lines.
    """
    result = []
    for line in lines:
        word_type = CharType.NONE
        current_word = []
        # Classify the lower-cased characters themselves: lower() may change
        # the length of the string, so indexing back into `line` is unsafe.
        for char in line.lower():
            ct = char_type(char)
            # A change of character class ends the word in progress.
            if current_word and ct != word_type:
                result.append("".join(current_word))
                current_word = []
            if ct == CharType.SPACE:
                continue
            if ct == CharType.SYMBOL:
                result.append(char)
                continue
            word_type = ct
            current_word.append(char)
        # Flush the word that runs up to the end of the line.
        if current_word:
            result.append("".join(current_word))
    return result