From a6e6f7405f48b6ec972fe8bc0d6250c1a507aa1d Mon Sep 17 00:00:00 2001 From: Ivryxia Date: Thu, 25 Sep 2025 14:54:05 +0200 Subject: [PATCH] main compiled + tests passed --- __pycache__/wordfreq.cpython-313.pyc | Bin 0 -> 3199 bytes topmost.py | 32 +++++++++++++++++++++++++++ wordfreq.py | 29 ++++++++++++++++++++---- 3 files changed, 57 insertions(+), 4 deletions(-) create mode 100644 __pycache__/wordfreq.cpython-313.pyc create mode 100644 topmost.py diff --git a/__pycache__/wordfreq.cpython-313.pyc b/__pycache__/wordfreq.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e7d123304f811d74459d823a94bebcb504db7e3 GIT binary patch literal 3199 zcmai0-A`M|6`#3ZzSqV$e3&GJCeDV0cne9WNTmQtm4zt4g4yNb4J*95a?D+F6LW3m zUYBN{OrtJXSrxA;w5{3B)GR!TB&{Er6PrwsxLifjLpZY*(2RE zXXea3Gjl$EGdsbcp8)-$@8!J(nUH^Ar%KT0v~v%dRiYD}zDq_ZqY-j~yhU`OmFQxE zUKK0Tn4n7uQI``!kEAQDIWTK(uZP z%|)pv-Li9Ysfz6ir+b~9iy*9$1YuMsuq+o&CIH()TU%Vh?LQ;2sNYi!!%ohbhT-`QV=gn3OQT$881uPgdQTHFjE^nm zxM|Ba?F^`;*m&%gCl3za9-j2%o1^ay-Sqr-hK46^>F?bfxaBGOhhy)Ik9uLlaNMM8 zrHrKOGV5;6H66oX=>8q}h&MFQn}|Cma}w4slg!S5Fqlc@=1kjl61OdPD0eq;n`Ls@ zuI{eUDj<-!@TRCuX#0OwrD{a4?&S@v`zJzjuvYei=|FW>T&n6Tu}qFY1_f(crK+`k zao>R**H>rkw>pG#7(*g!o(tTAWT`=#EglwrYfoCD58J#-$&d8JBgT3oIQt zn^4_-?_VHGAtmDC{@JF^SLUOOYx5Rh9-iTg|F<_?oCqJ;E_C<3ShWnFW)1V%oHM4)5o?rzEpk z)1L99`x(oQs_YC}04Q;CY1fl58$As%iWh)am&!2)z-Hi(M=f3~o{(}K%+kmnRq94{ zGY?GL`ov_HK!>H|Yyg9@@z>>B{uXS^Pj1$?tjrbW^3ifbQ@(Svp>?h254~UZ{=R?X zRM+~cuF|Q?uNtl_&gPSwiniQYQjV{6mz1_mt+Cki_)PwtvQPVLY9km~4@OG9$Y13k zA6PL8M#a9Ip(h%M`4OVNEHwv`p%GF?o4rWczZe^^pl z)}miM{HFOk`fcjllckQnm#uwe$-gwVI97b?i;mAb);dd#9WSJd<)-kjM|Vz=#_rcX zQs1;4AzCQ^6JEZlgEfdL!RyuqVU=8gYUOo%8S8SYyALXgX9=L&)X(08%YE>F5^81Y z+;!o)fM#{t1=Zz6Awpro7x zP<%8}Ji9plo00Du8uNpXMqUTdO?5;I=G%VgrW(BUgF`sHft%js4uBwlXbwR=0BGpq zBQ(|1Q!831o0Aon9E3rg!W&4qCIEoet}$JV)2n*{gf%81O!9OM65}0l>8R$(mTS&A>;m*W z#mO+&obhCqv}a9r9>-+9C;-;|oa1^Tz!#`djw)UN-$^;sPFbeoQ5(?-^ms#g5~^4y zhz>j>coJ9BxbEOF7+g&JmEwE~0wi)Em>>K;9N7q8SPx%#6~0)|mZh>ij&26 zU#XGTB58^KGtf|M|8jgi(3v0H3^o>T7UzGPSf-nS`epa=wc`7QzO}P21E)4M=oc1> z^My|~LTA=PXPyY9(7CS$zLuW(zM;?je;nUvzqa0f?Ypz3_WoDe^?%ExzLk3|2ELIH zw;ASh1{PaF&{lq9M~q%tPSmAv)$ZCL&d?hrq74B35ZYHpEysWyC`nIP0pR)B`U4oX;$eC HzU2P^23ca{ literal 0 HcmV?d00001 diff --git a/topmost.py b/topmost.py new file mode 100644 index 0000000..e2efcb2 --- /dev/null +++ b/topmost.py @@ -0,0 +1,32 @@ +import sys +import wordfreq +import urllib.request + + + + + +def main(): + stop_word_file = sys.argv[1] + word_file = sys.argv[2] + amount_of_words = int(sys.argv[3]) + + if "https" in word_file: + response = urllib.request.urlopen(word_file) + lines = response.read().decode("utf8").splitlines() + else: + inp_file = open(word_file, encoding="utf-8") + lines = inp_file.readlines() + inp_file.close() + tokenized = wordfreq.tokenize(lines) + + stop_word_lines = open(stop_word_file, encoding="utf-8") + stop_words = stop_word_lines.read().split("\n") + stop_word_lines.close() + + counted_words = wordfreq.countWords(tokenized, stop_words) + + wordfreq.printTopMost(counted_words, amount_of_words) + + +main() \ No newline at end of file diff --git a/wordfreq.py b/wordfreq.py index 5e19758..2ce24cb 100644 --- a/wordfreq.py +++ b/wordfreq.py @@ -5,7 +5,7 @@ class CharType(Enum): NONE = 0 DIGIT = 1 ALPHA = 2 - SPACE = 3 + WHITESPACE = 3 SYMBOL = 4 @@ -14,8 +14,8 @@ def char_type(char: str) -> CharType: return CharType.DIGIT elif char.isalpha(): return CharType.ALPHA - elif char == " ": - return CharType.SPACE + elif char == " " or char == " ": + return CharType.WHITESPACE return CharType.SYMBOL @@ -27,7 +27,7 @@ def tokenize(lines: list[str]) -> list[str]: for i, char in enumerate(line.strip().lower()): if len(current_word) == 0: ct = char_type(char) - if ct == CharType.SPACE: + if ct == CharType.WHITESPACE: continue if ct == CharType.SYMBOL: result.append(char) @@ -38,3 +38,24 @@ def tokenize(lines: list[str]) -> list[str]: result.append("".join(current_word)) current_word = [] return result + + +def countWords(words, stopWords = []): + dict = {} + for word in words: + if word not in stopWords: + if word in dict.keys(): + dict[word] += 1 + else: + dict[word] = 1 + return(dict) + +def printTopMost(frequencies,n): + list = [] + + for word,freq in frequencies.items(): + list.append((word,freq)) + list = sorted(list, key=lambda x: -x[1]) + for i in range (0, n): + if len(list)-1 >= i: + print(list[i][0].ljust(19),str(list[i][1]).rjust(5)) \ No newline at end of file