From 401e7e5497284b327282e834d16baf01bf504d73 Mon Sep 17 00:00:00 2001 From: prabhaavp Date: Thu, 12 Feb 2026 20:07:09 -0500 Subject: [PATCH 1/7] - Extract info needed from ZIM file --- requirements.txt | 7 ++-- scripts/extract_wiki_zim.py | 69 +++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 scripts/extract_wiki_zim.py diff --git a/requirements.txt b/requirements.txt index c2685294d..7b5077898 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,8 @@ # Run the following to install: # pip install -r requirements.txt -pandas -dtale \ No newline at end of file +pandas~=3.0.0 +dtale~=3.19.1 +requests~=2.32.5 +beautifulsoup4~=4.14.3 +libzim~=3.8.0 \ No newline at end of file diff --git a/scripts/extract_wiki_zim.py b/scripts/extract_wiki_zim.py new file mode 100644 index 000000000..27a2c81e3 --- /dev/null +++ b/scripts/extract_wiki_zim.py @@ -0,0 +1,69 @@ +from bs4 import BeautifulSoup +import os +from libzim.reader import Archive +from libzim.search import Query, Searcher +import csv + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +INPUT_TSV = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/imdb_datasets/title.basics.test.tsv")) +OUTPUT_DIR = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/wikipedia/wikipedia_html")) +ZIM_PATH = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/wikipedia/wikipedia_en_all_maxi_2025-08.zim")) + +os.makedirs(OUTPUT_DIR, exist_ok=True) +zim = Archive(ZIM_PATH) +searcher = Searcher(zim) +print("The Zim file is now opened") + +#Fetch the html AND the images and put them in a folder +def fetch_wikipedia_html_with_images(query, save_dir): + q = Query().set_query(query) + search = searcher.search(q) + if search.getEstimatedMatches() == 0: + return None + results = list(search.getResults(0, 5)) + best_path = results[0] + try: + entry = zim.get_entry_by_path(best_path) + item = entry.get_item() + html_content = bytes(item.content).decode("UTF-8") + except Exception: + return None + soup = BeautifulSoup(html_content, "html.parser") + for img in soup.find_all("img"): + src = img.get("src") + if not src: + continue + img_path = src.lstrip("/") + try: + img_entry = zim.get_entry_by_path(img_path) + img_bytes = bytes(img_entry.get_item().content) + except Exception: + continue + img_name = os.path.basename(img_path) + img_file_path = os.path.join(save_dir, img_name) + with open(img_file_path, "wb") as f: + f.write(img_bytes) + img["src"] = img_name + return str(soup) + +#Go through each row of the tsv file and try to get the movie on wiki +with open(INPUT_TSV, encoding="utf-8") as f: + reader = csv.DictReader(f, delimiter="\t") + for row in reader: + tconst = row["tconst"] + title = row["primaryTitle"] + year = row["startYear"] + # folder for each movie + movie_dir = os.path.join(OUTPUT_DIR, tconst) + os.makedirs(movie_dir, exist_ok=True) + outfile = os.path.join(movie_dir, f"{tconst}.html") + if os.path.exists(outfile): + continue + query = f"{title} {year}" if year != "\\N" else title #if year not empty + print(f"fetching Wikipedia HTML + images for {tconst}: {query}") + html_with_images = fetch_wikipedia_html_with_images(query, movie_dir) + if html_with_images: + with open(outfile, "w", encoding="utf-8") as out: + out.write(html_with_images) + else: + print(f"no Wikipedia page found for {query}") \ No newline at end of file From 0ac1234afaf1c1fde69768e70db55cabc42adc0b Mon Sep 17 00:00:00 2001 From: prabhaavp Date: Tue, 10 Mar 2026 13:10:25 -0400 Subject: [PATCH 2/7] - Fix directories --- data/raw/wikipedia/.gitkeep | 0 scripts/extract_wiki_zim.py | 10 ++++-- scripts/scrape.py | 2 +- scripts/scrape_wiki.py | 69 +++++++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 3 deletions(-) create mode 100644 data/raw/wikipedia/.gitkeep create mode 100644 scripts/scrape_wiki.py diff --git a/data/raw/wikipedia/.gitkeep b/data/raw/wikipedia/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/extract_wiki_zim.py b/scripts/extract_wiki_zim.py index 27a2c81e3..df15ec220 100644 --- a/scripts/extract_wiki_zim.py +++ b/scripts/extract_wiki_zim.py @@ -6,7 +6,7 @@ import csv BASE_DIR = os.path.dirname(os.path.abspath(__file__)) INPUT_TSV = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/imdb_datasets/title.basics.test.tsv")) -OUTPUT_DIR = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/wikipedia/wikipedia_html")) +OUTPUT_DIR = os.path.abspath(os.path.join(BASE_DIR, "../data/processed/wikipedia_html")) ZIM_PATH = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/wikipedia/wikipedia_en_all_maxi_2025-08.zim")) os.makedirs(OUTPUT_DIR, exist_ok=True) @@ -53,16 +53,22 @@ with open(INPUT_TSV, encoding="utf-8") as f: tconst = row["tconst"] title = row["primaryTitle"] year = row["startYear"] + titleType = row["titleType"] + if year is None or titleType != "movie": + print("Skipping from TSV: ", title) + continue # folder for each movie movie_dir = os.path.join(OUTPUT_DIR, tconst) os.makedirs(movie_dir, exist_ok=True) outfile = os.path.join(movie_dir, f"{tconst}.html") if os.path.exists(outfile): continue - query = f"{title} {year}" if year != "\\N" else title #if year not empty + query = f"{title} ({year} film)" if year != "\\N" else title #if year not empty print(f"fetching Wikipedia HTML + images for {tconst}: {query}") html_with_images = fetch_wikipedia_html_with_images(query, movie_dir) if html_with_images: + if "Directed by" not in html_with_images: + continue with open(outfile, "w", encoding="utf-8") as out: out.write(html_with_images) else: diff --git a/scripts/scrape.py b/scripts/scrape.py index fb9f976ac..9559bf152 100644 --- a/scripts/scrape.py +++ b/scripts/scrape.py @@ -19,7 +19,7 @@ response = requests.get(url, headers=headers, params=params) print("Status:", response.status_code) print("Content-Type:", response.headers.get("content-type")) -print("First 200 chars:\n", response.text[:200]) +print("First 200 chars:\n", response.text) data = response.json() diff --git a/scripts/scrape_wiki.py b/scripts/scrape_wiki.py new file mode 100644 index 000000000..8c7c1e7bd --- /dev/null +++ b/scripts/scrape_wiki.py @@ -0,0 +1,69 @@ +import csv +import os +import requests +from time import sleep + +HEADERS = {"User-Agent": "cse881"} +SEARCH_URL = "https://en.wikipedia.org/w/api.php" +BASE_URL = "https://en.wikipedia.org/api/rest_v1" +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) + +INPUT_TSV = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/imdb_datasets/title.basics.test.tsv")) +OUTPUT_DIR = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/wikipedia/wikipedia_html")) + +os.makedirs(OUTPUT_DIR, exist_ok=True) + +def fetch_wikipedia_html(query): + params = { + "action": "query", + "list": "search", + "srsearch": query, + "format": "json" + } + + resp = requests.get(SEARCH_URL, params=params, headers=HEADERS).json() + results = resp.get("query", {}).get("search", []) + + if not results: + return None + + best_title = results[0]["title"] + wiki_title = best_title.replace(" ", "_") + html_url = f"{BASE_URL}/page/html/{wiki_title}" + r = requests.get(html_url, headers=HEADERS) + + if r.status_code != 200: + return None + return r.text + + +with open(INPUT_TSV, encoding="utf-8") as f: + print("Opened file:", INPUT_TSV) + print("First 500 chars:") + print(f.read(500)) + f.seek(0) + + reader = csv.DictReader(f, delimiter="\t") + for row in reader: + tconst = row["tconst"] + title = row["primaryTitle"] + year = row["startYear"] + outfile = os.path.join(OUTPUT_DIR, f"{tconst}.html") + print(outfile) + + if os.path.exists(outfile): + print(f"Skipping {tconst}: {query}") + continue #if exists, skip + + query = f"{title} {year}" if year != "\\N" else title + print(f"Fetching Wikipedia for {tconst}: {query}") + html = fetch_wikipedia_html(query) + if html: + with open(outfile, "w", encoding="utf-8") as out: + out.write(html) + else: + print(f"No Wikipedia page found") + sleep(0.5) +print("Completed") + +#https://en.wikipedia.org/w/index.php?api=wmf-restbase&title=Special%3ARestSandbox#/Page%20content/get_page_summary__title_ From 36af063777eace996fd62ae11c9e07fda638f1c7 Mon Sep 17 00:00:00 2001 From: prabhaavp Date: Tue, 10 Mar 2026 13:17:21 -0400 Subject: [PATCH 3/7] - Delete the folders if we skipped a movie due to not being found --- scripts/extract_wiki_zim.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/extract_wiki_zim.py b/scripts/extract_wiki_zim.py index df15ec220..ec7854904 100644 --- a/scripts/extract_wiki_zim.py +++ b/scripts/extract_wiki_zim.py @@ -60,14 +60,15 @@ with open(INPUT_TSV, encoding="utf-8") as f: # folder for each movie movie_dir = os.path.join(OUTPUT_DIR, tconst) os.makedirs(movie_dir, exist_ok=True) - outfile = os.path.join(movie_dir, f"{tconst}.html") - if os.path.exists(outfile): - continue query = f"{title} ({year} film)" if year != "\\N" else title #if year not empty print(f"fetching Wikipedia HTML + images for {tconst}: {query}") html_with_images = fetch_wikipedia_html_with_images(query, movie_dir) if html_with_images: if "Directed by" not in html_with_images: + os.rmdir(movie_dir) + continue + outfile = os.path.join(movie_dir, f"{tconst}.html") + if os.path.exists(outfile): continue with open(outfile, "w", encoding="utf-8") as out: out.write(html_with_images) From 2ec6f8c28a42f5b6f9cc0b4769480c99885c653e Mon Sep 17 00:00:00 2001 From: "Vadella, Anna" Date: Tue, 10 Mar 2026 13:29:56 -0400 Subject: [PATCH 4/7] testing extract_wiki_zim.py --- .../Mean_johnny_barrows_poster_01.jpg | Bin 0 -> 15044 bytes .../wikipedia_html/tt0074885/tt0074885.html | 175 ++++++++++++++++++ .../La-meilleure-facon-de-marcher.jpg | Bin 0 -> 10766 bytes .../wikipedia_html/tt0074888/tt0074888.html | 159 ++++++++++++++++ 4 files changed, 334 insertions(+) create mode 100644 data/processed/wikipedia_html/tt0074885/Mean_johnny_barrows_poster_01.jpg create mode 100644 data/processed/wikipedia_html/tt0074885/tt0074885.html create mode 100644 data/processed/wikipedia_html/tt0074888/La-meilleure-facon-de-marcher.jpg create mode 100644 data/processed/wikipedia_html/tt0074888/tt0074888.html diff --git a/data/processed/wikipedia_html/tt0074885/Mean_johnny_barrows_poster_01.jpg b/data/processed/wikipedia_html/tt0074885/Mean_johnny_barrows_poster_01.jpg new file mode 100644 index 0000000000000000000000000000000000000000..def287049bea0238de99b811535e95a13c6b7b68 GIT binary patch literal 15044 zcmV;#Iy=QuNk&GzIsgDyMM6+kP&gp4IsgE$+5nvaD*6C?0Y1%Cpi8HurlPPIJ=pLW ziDqsfWto)dWslw;TKu|4uOato^yu%r>GVJBdgT3ezC+F5?r*9esPF53lf78phut#& z+iib&3;RC{e4>2k_kTcaz51v4US|I<{IUIi@!p^K2lBH}Uu$ndfG^fRyLb@l_3$U_ zzhk}uzo2>f|BuA$*8PHio`1FVZ2xb|$J!pk45EKmsS11Ze>hY2G9|B~GmFeIenhs5 z!-?(>n9EMY>-Q*?mA<$?0MT!N!l02(^|=A3Z7tuM!!=eN`}-oW!-1C?fAnlO6L9fQ zuE(Yzf3C_1L=B(6B(NA+Rwym{C-*RULrLx*K4leVLi}%{WNb7O|A?13J}_+eaXb5F zV~O~Ai8rM+gc=)Ug3yJ}N`#MSW&~jh{)8|$`Z~XAJ!FW3iP|wg`QuDXT^P(-WB112 zH4d8V*YrLvwI%-9(^R8k&Q^It@Z?Ih=5D%AW@)+Gb6XY+cr65(NU+|`o`~)$vfZc2#w?*IE zB&V`l4*G+Q(WedO4D{{%c@4knaVwr2$4ogrT;P?$cA^_~ zVN6ImK}GY?Wqqi7rIb3ywMllPJ(vsQJev6-9Vnx{!#8uJjj9!ZlmzLsQqY>0Du5c( zQ)vhLt`V;YELR5huTRIdAg~o=qp;%rJXVFR7Xyxx_%n=dU-Kk12r0}|ZErqZHqKJYWpHNoL2LELk~+gNAu0AT>p1_)mEBeL8&U=793Y(r zAF&XGd>~TZ(2R`|5*skWs9acV^uiE$wpccV^MB87JLIv|#r>%I%`qAaXXMO#m83IK z1!}gwm5eHF8kKcrwY+ZL2+c&RpzT?W(WUv~Z=m7i*{OtV4Senh9Ni0BXtEy-xRXJ2hS5@v__;vA_~Nqz=qmg+1}4V^&Dv&6S?_B9<$y; zk7KrQ(Nb3{oSsEZYj_Jk&{BW?wj0S=f;R=kC&KI9WgDA;wKTpD!o}ju!~WWo%*3-T zS)wq9LQ8Ol^gYM)!jU9ps5VofaG;hbU}do`ca@)qy<<-9s4Mz*Qi;1ox_miM=4do( z=`B{O$$xn>B_KvNoskN4>((YZ_8Y6!{^7@|3mXOtE%6Sq&F1naQGVct^5AW9KSmD= z+n=2M5P#zd?B+&g+&0~2C1&Typ^nozNgr_TQ4^D!DISmczHNjyjhUGQ{tPhfDu(<~ zXML?XDf)cRJr;Ld=ls~}F0q%e13yREzS9;!lSNKWknyYZ*8n z`ymcahE+d3Yk`kpg70=_`!jvRHRX_;K<~B}dQKrk@(GddFDyAJz+jNQk$j=jw$Mw9 z#FKoaU-7oQLnhGqs$$J8o9lTeo$C9aD7mqFF>vKiIfaV*u=~Y@TzM;B3hCc?#n#74 zNMLhwkX$r4gD7}2d9+s5%PMSF27{JiepDN8_W@e7O{IpcEhCi!%CC;kx7g}X)OMXE zAxl0ZUU|bF!QzS;UXfof_1wp!Ap>>>?CoE~zJYErqmCZ=R`xRNEHaIum&}Gvo8W$_ zw_?9W(_b&Km|3=~NDJ45m-tjUG`qRJ-TA%5uMpWBFS()pb`3zdcN9P=wYAmVta2H% zw*Rb0FRB-MFHY&e!Jsv|1Q}*jz{a<3w6BlVS=lfbg_C(omE{Xwz3}C>4CHV&dveu4 z>mrv2u#2ZD%aB8O$~5^8paA~-FBkzl!|^b@u51T&Z)z*mS9+SzM;UCZ#(k40BceMPG-Qw_`Scjrb>wrs1qrZcDtZ0lkTW4-n2K{_4V^KvuAtW5%KoQ%-! zBU1$dJ1+j2x=S0kuvC>?z{q9q_){%{~ejbl~yRj3{|3NZf} zPnqx}jiB}kp7DGp7$ceDXl-|;xtR>sbcQJttekd!QmsRdV)dLY!t|2r`s1GDwj+}S zQH6+EEd3|-HQWg~scUOjF9LP}w2;c!>Jh0Slx={^QmEs)$*81tgC_ zNLaodmso)lq^+xcRxb+enh^$g#=IIkS!U4cN`yH+1Rist`Qj0w<9RCvjQZjU>|RDF z0uvCH5Px1j_%f%D;~4C%kyytd#ZUn7Ln*(UW!XNQH#%M-=m~90zrNy_y6IB3)?A*D z+<((_)1rkYazl<{KhA(dWamgU7TV3C zhV6D3Npo>Lainm}Ibvw`ANh#t%Agb|P>Wct)WD&0HVb{+CClb`o;VMfv|OL+swjB$ zdr>(^%#@VB`+qKs-SF#o8sd*N< zJT)-HZj@5)E0?VON^1podJTE<^pTv<1m&yF5c zp8r|Ke`P{U``y{VD-4?OSR5M0yW71X$eRu#u7Pkst612zT)D}-IjE%e4Hy1CP z9WI?r)peYae<|Xl7J5mg$=gbzZU}0(Qa`M>Uw|?#`Z0kX)IrDk6B95Bu$F>^-iMV? zzQ?F*#mcuG<4g8Y0AqMF#fSu20FD>8gwZL~<$V_KaA71>Zhu+RZuaT&8*zw9=n$I{ z+3)=Wy-(l#Xo6ibXmd`?r`Sw@2=JE?!}TFDft6B{QJS3Tzj1kxf1qtq1}5ky>Y2)) zko*fHgU9W5c)jGt}h-C0Wnz9JpZzG*h7(JyyB@M9y%9=Tka3B{SHoy<;`6Ygo;n zIfeMrur8(5-6qw*BBcR8OTK<5U^soThgTN7P}##QnsqVlx`nBlZYZM_Ko^d+SWe*A`Yf6~ARVGyUkf_bYM#X(B8kQ{n34U*t)5Ya~PjCb-JBGPcfHmg8M^h_)gBw?YM-Ii8j6=9FKg$g6l?=o7v zbSUD32-Rzw07f@a5QX~!1V6BLdRU89^e_YwTHthp|DZKa?!$5eydP67>Y;c=CTwhx zHK&C(~&jlgbO~I$AMwxY9 zfA>5ENSWbILz0CQMYKC86P;_L{7F_pXFJDJ0`;b}V~xGsf8kV)ClW_Ar#zL%mxK3E z)^Wn5`WF`PDdh($=)$v3op3hTkF11Xh)}~FcJUaCvr;bKY>%MP4 z)MCWfx^DxQmqMtu__g_vy#<#E8S^^Rsi5t{1^+6_>*g5@y+57^SNJ*eR-EX=NdsY> zhQooU-7JEUP;}O9{Yf#%qo=Hds~#69(tnUo{4g{G59zk5?I3pUM}3AIj;nbjT;M3{ z<0&JwXu?}Zb{$D8JzxcNAL-boKD4Gb0jWMxXRNz)TxXUUHE9EIuMbY@yZhLoBz1bi zO<@f*Q{Fe=hn}PzCoHzi2ab;okmByruiS}Z0dtQggBQl-uLq7`+aU*_QM0ZvNsQct zg;oZk57dUd+jkDNj8K|+J)ZJEcxhC++0w=+xU zU&`o0GCm1-^L|)44Ds3SH+PMr7xU;Peq7@d*?wPFeK(z;tdEexpVVL7*1b9GzE;nukYo(Y9;eSx!+m%gsyEAPo<`#y5#->_4<5Qn#Ua?B39H;-+-+*8M&TjaXrit|XA_^Q(e0@~3Z4r}zp(X1{Z!WSOyh@t?B@rB z+M)unr_2UHdRYpk1@33*U>TbwY*Yz|9UtYIO}I95+K$uH<&1xaS)TfBY-&lJ1m%wN z;EvviRm2PA@fAl_d+J#fj~$*Rov|%0pX%k|Z(76(IbQDE69@Av_*KzFK_9$xN^kzG zKr30%iwOmAbyeDp6(Y!&tK9>Ht#A$4U*Qt9!ks_uT}|`NiDrTRDhf{lM&HM}D2KV2 z3#`LqX8ZgeTN3>C1P(+AjxTYFMh5l0^^MHwg<)`VNVe>o*Oz4%meqGAy%Qca zuI`A0*|C2lkY=%kcadwkUn;lDY5G>lss$TM!6 z=)g*+&mEi~Qo$)R>AQaY?y96t$5Tt_G7FErsG&holI5TdQ3*k`37y5L*#Bg=K4K1?z9F!BHFg=zc>WGkeRQIydp%r$Y^=mDf<>mlV3vzx!WBZEq%7Lv4qT zkjl7~V+Gi6_$hUf?+kkO;Re@trzY2RJ1AbqT1aC&u1(xojQvs{3WH}N*zv9n%6m$a z!x806OQm*=OeV|a@wU4R*ka*6MTzwC#4->3XKyz|bFWt1XbbIo`;&@e`xRNlrQman2j^Id4FR z^%nq=AJTL6GJ(24q~4OErv~X@ZR^!kSgHd5QK0_wUR1D0Pasi>ufd%fW^V0Q@O}pI zFVe&Y6#;RnXp(&b1e`izDhY~N{x%nro2&tW$QXmZ%)DQBs4DPEL-<#DLOZKS;_3!% z0aL-3Q5yr1Ri@BbeE*lx3*KmzEQBPYV>qz-2y?7A9~E|T;W(I0{0#X>En~seoJ381 zI2UUDL$v)+64|mV0J~YJcZ?E0O_R}*-I+;e!G0PT?eT^IEPWh~ID2NdR_cH(kFhS( zZr%m9l`E>HYEUK2dPE>+@N2JumrO?tl~<3>PNmR}bW4weGg0SJ)so{SK%hN9Tds* zPo8zVd9r@>fEpOXZIPngW!*Z6Wy)c28c_?hSG@xMh~@qiAV!Q%^Orup8cJWmrMFW3 zFlh{`+{+RH%J?=F%qc&c%YlfKJ;GuwPY2?h_fII-XNw+)Fsu`g+IkGsEc;rrmUKz{ zWU|WHn1@mK7ctEs6J1Te@)fw3rYDsFcAu&O2EKO{AjyhYiEG^tLI`L#i$_@6o6dpM#k2@ZAumaAzOBZn)wsXc!3U{DUc+D~nn&m=TT)uv#8VmMeVM4Z7OHEZ zJ+%IGyGQgm-Y9eY=sxy_#tkJd7RIRC#bKYtmq}AawK6~zdX%=_adPLkLlUpAUSS{% z1YHA%)NXYLX3~5Q(+TF$42sthWwj@ka(B=_H0kT2%JJHTiy|0IW~G#_ZOefUmY8z_ zhfAe*XaL_EUf=q4k#u9{TpSOOlii*f#ItGkVJ_+hSo2d6Tc?_7;%`9eHd7$7U|jPQ z89r(U_N5O5K(m=fKvk9vZCVs_rzvFcofLibJCi7d6Ar|TRW_y_>0b%;V$y`$4JkgZ z91ht|eKxyoejjCF$`RRFrB1Gd7+QlFax9KzJtJg%ye(*!O8*ap4x~8dl9KovrGd*+ z(=h|fZg$Hr^c+;XY^oA<_6vpaGngvC=jm#;3TNcsfiTol0LGrV~l0sF*gdu zt(~P7h}FaEf13H3kU77{T!Fm0-3DVB75t1&D)1^{0&O#x2b;AkWVuF|VqW0e|84xx z{sX%tzkUr$9rWO9^}opf708i369*jN;Y>n z9MJ+6?&E!nc9U4OCdtnNXKE|l7b~Ssfu_wr*v z;xuEH;mIkEcKP05jC?yWyyhgAoxp#`8|_4@C5=Qf)1gOlRLZ`#hs1E2Xgmk0AuCc`-W2{uqW>nq9#>d~->n5wO3 zI2j|hlecvJl?ST9JAXo<~j;-h!zTJh=nTg z5htStfgA8eSoaKfI+!mV!AM42JqyUtS;Car#%x%`>{1-Ozrf12*hZzg7~KGw7Od+h zaebx}PQ+w^n&E+i#}rb?z-dwUB#>i1SiT_kQ|)f?AI24={tuT-;%l_4J4{FmIyg^fW9fj6PbdGR^LA*tR_>`?h&=ard`{! zi_;Xm`GFQQKZ9uM`#cCmq7{#QjTzL?M)ulXQFu?@>)Gbiz8IHBrf85iJ|!lyW}wF_ z(ou$JBRX`(;G8F@`iM!02qryy)K(~tH#~NGRyp!YRA8^7-*!}wKS}r+-#0tI{@WY+ z5F;qngkH@1+A&Gh>CiGmSOo`nuMI{Db%Ke{do_jYCp;2~e2`-IXaIxfJC_|^t{@Eo z6!W4D=a*UAWwO54o0N zwx+yz^)@0>U`vJY>ZZ1ZN9R&)(zWH-{N7o!(L)7-W5`{@tJ$RfGShS*tSfy@XK;bR z(1z14E*j5``p_aV!R(T{i^x4Xn!VL=-0)`lJT#;AVw3vY4HCc~Z-C;@H~f|e(tmAc zp)4}`gOk`IDv%do%y@yG>Tkdn^_*bj`)Q~f)8hhSk;ozTRDAggmmCt6=(i!}q!MjC zHcTeqO#42#EN9)7=*&bDNkNha@^ze)?)6G-x8hc|SeT7yA7xAkip zrs-Ziyw&M<8XV#B;0zotGN~E%4X8imI1P*~d6felj(Vr# za`s8_IcxK5#B>tH3f+A?%AzV{(`5n^RpvwVpEQ@q`t#FDWN+h|O{7qdK7gs3%|}XhLr#`#H&EiFxkrS}(g-(}p7pl+ zJy}fzAM4HOLW&i2o4;9haLabezVkd0>pj7hH~tZ#I^DfG-<>a~h+I|GjfYsKq%1-! z3S-BcWaMUthAs{`!ix?y)^m$)SH+svHW87hK5JsCwwbS|F@7s$ zjXu+o+<}_E*K4O_&N!>6Jgls{&YA-sw%EL_9y%3ki`3f1k;i&kKV$x5Mh1qGgfF>G z7$qu;Wd4#Evq7wvM`NU)5IAy?Bk^5}W@5niUd9m7))_9^XK8{}Y#Zr@aS6{}@n#H|l#?VHpH-PWj zk`e7HyQ?uD@+4~p`dAQ}fAUOlaeWsgItfV;AK>GKaJG6?VWu^fGrtv5Vd8D2Ni{mO zYo^cTQAw(UvHHW2v*L9aL8>W zE1e;2)8Kf@d$3@;zBoU;!FfE(b=A1CV>O-F((Wz+;tHF0on3I;9i7k*zF&+#HTIRw z@W*6<3PvG$I;&(3OFA0)gE;aaS;`AH3w&6x=)Y{*0_U~J;r90*5+#|cu{2x-`cm}>CEeRV7IH)gKfg`$ds6${v*VcMwnQ-UKFomY?LR76 zhKYa4q74S2`P55FML^RVtgA$OVOd;)?9U8(ur7mymMqvDfP__mixoKT1s>gWX+f{< zjFI;R(1>(pdWHdkHw9LsWE`wZiz8X#~kI8=|KY&29o(iS3C6>mq z@Ho0;+A+xOX<#6QbMXrT@#32k7CR6a@d+o}Jv=GxGDiw+dMWi{tW0BF`P}7B+2LYB zaWKbVI4Y`?E&r~Ha}zTnIh6~GAqBV^yDFQo#9K(y2Qdag)H}jmXUt#H^!nx2JA9=_^1onPHR|1@LvPP{>)x2_tr!8*$l z@H9MK*!T;83Vz*-Vp=zjCIyD&vA>Kb7R(@<_AHna&gN7(3Q&Cj~Xk2J&c*%cW{!+GF!hpTO5q8<}5kR~j38AgxD_tti1MB*PN?k{_-MJ0;Pr#6hIrEkI)NZ|nE3WTPK z5fvLWOgWL&F1CkXhF5C*1kMYo3gt<1uiGtxTMq^0bl)M#eAes>I06KpFnsjxLHK9v zeL}fXUDbpt!-<~Rxga`6zl9x}jwad93!$@Yuz9!a9PAkqUXNd`u(pyxx_L>#YOH|0 z@^reoy1lK3Q0%yRXc2kZL@MxMrNsnaX5ku^W&o*Wj3NQzPbe<{elx$(Kk@;Y; zgZBT;!0;(RZ~NsqX&+$3evYjZ=pd9W%@8z~4W`+)8i89u`9Fr)(p(;oBma@~OQ6Gx zkk5djfL;5Z)mdBEmd)fWPnP$d118ST_$1Zc=MIMa(y^v9A>cd$vR;9$@`Q-`p3}}R z7{PJP!&c{1Z-z8Qa@8C+wQ724P*4s#5Z??2MCnef%IRfx(G(-WKB*)vwF7L5ZZoxe z2`Zp3&(aRfC@He(9AJ$)LjJiHal=ZeQ!a`vwU*U^Yn%U$0j@n<^6txCud2M#0@*d<&q6@JBx~h$+Q86 zrP}8c)P=lfKCN43ej(7Z-$i4c=YF`67(G2ck+pwVovn2A+#LQu4LKG*ipu~xugIJm ziwDl#hxE@|x)4~`vb&)$4_|z_`oYd27~&rr%{D+&8!I&+! z4pB+{+t?Rt#6w{6`C8Ch4s%BnibZJ(@r%4kjQsu54pxgy#^%Sv@EYTDOYsROFC!8cA`q_#n zIWK0#Z@rb)(@+{vDf${7JBx87h2TX2GzRo=Zf|>MER~HdNDuS9qrDWwR997$TZ}3v z)kyQ&SfbDNisXX4zps<Zje7l6bNSffuj%@J1J2j*U(E$>4O$}H^pv^RNkU?+^f6FHjY*-H)OA>qw7Q}NkB`?h+M;vD*`Sv z4)=2EUSqDF5Ukxu>a`eyJ|(tchqfcRCJ;>cKyiR{hmZ_#Voo1I+kJf$g>ok~!r7kbR4l5==jWF9 z2g^LAKK+CqQ=S~F#B;-L86|wk7jcn&PFaU`V<&i^5;V3tkmrNQ*MNFNz{XMn(~I7M z*)@|6hK7&)Hi?c^KA6s=MQ~#`lC$yos($%24K&JH+C4xRZU~9Nik<@*4ZdQ^h zj)UB}Qs6xpH@(Au?87)kGzZ$J*Jmrq@EwkCLq6)zhFrbZ=Oh@@I59~ku?t=Kou1=F z*Ks&&{%QN|OD(+Bz|tmZaueG@{$G&l5z1d={#+5kNtjP=4jrAUB!5}1+YES86-VR) z_U1Wy$=@Q+26XXi?)J=yi{$2&A%1!;U4tl52}mDwiZ5c4bTWfe6s0>&l1+e*VLhj2 zRrM+`&fF@_L;;UIUUbOpJ|kwFZ6<-P{-@#Xn_i;)o8O0$lJ)s2$mdTXU&Uj!nIh3s zf3A~@}`*e;e^0`z1m?sfN)NFWnbaQk<$oqgFNr(a(O#P(YaX=n> z1`X7JULK9MrmPQ!KoUMnO;{*AxhtwAs$UMR_#w#z;@nRHUT$NH0L!tLCPvz9F;9I; zdHt4esQC%Uu=@$WIh(FR=`8+OS*KRM>dq^=tPhQaP$Zrp(=xs5-&wt$GlTf5ASRpJ z%}+vvsOEBU+okiJJQA1$G1Srj^;X8Rp)%RwOadv;LC(rmPhXW;mL|vLrCW8Ft0<9@ zgM>Hy-!&95KTmDZ=3{T5nX}GpI)a?i29E4TdI>yXN5fS4#zTi@vpkpqs|pbuIS6>5xLwCuU#JP zwz}4ISjCpAZk#tO0~Sw#43nmeB84b)C(g628y8b%DRtABC~wy-)Nk1->C_Ry!rg=? zo>Dl#1sunLDUrFhiGcZ9jUc@qXJBk($e<=_f1gK~tbB}+4ih%s&W~WIr<~D&_@BPF z*M-eh$ukzgDNV*-b#M<2yI8qGaG$D`VcZV9UsrHYvQzx%~+=hr2qZYgrWr*#81$tQ$s!@;e74p-f^JtfR zCKGY@<3FQ-Rk(0e^`emnc&J^XS2MFOQR{R_bei3WLYaJDlUy*+YDXivB8k>U9?Fi| zmVmjEvJTp_P50XMS2GGj?~)DhUr7BJs@Z=u&zSR!IlG?fJn$kX}j;eD~k#3P9qJo16GRGLqk4 zUd(sV&c(O359;3+op0C6^A=Xyy&g;;)BH8x1;;J=6Py53u}DvCg>6NR~Fl2^qiPxdqe6|&DPUkw|ix>MRzd`G7<{&J+omA$A7b7 zevy)}8RF9ng4j}f>qaGY7mEXTSLTqq2Itm2+(xAv3YeGE?gCpm8{Q&nXVAp!T4GYE zURhTVSgKgWeG4tAhy}>JHmJ-99=ETSWu1%C^Vrff$SerjG7Q_ zchN#AYxtzcKZvUQnEyjz>n2RUa^^ukAMC=bP!MB6&hZa8ZU}dlW?xFn^%F{i%KhEr z7`^&(I!}J=gk76E0vUUHzDWh*+~~dy-io#W##ZAO&FO>91FQ{r3JZn}lB78|yT$qgPwCcXCtXTX!S^M|?kXDe&RCG4DJdbjNCq+|EwT zb6MSMl=(nus9-E{D&qke|8F3y(|0tWO$AqgF`LT-2girHuQg!HBQw=y6YmUoBF?Q-xX<-NESHnL)#Lm+ zXp37=J>2l-GVgXy(3%PI<+`kGua(ZyO8(faYhq*wopvbPZ%o8VmpPx74d`i_TrA7y zy`XAoo)~_X?7wk*;TSjcc3Eg!$$||n^XO`*jX`;aa{gwzU88g^&ztk(G_~wMr}IoN zAPHp=zvZfA9iz97ule8YmLXbh*tnMdeF11~Z%a@Cb`B1Er9AKBCJg=}Hr^U;=4W)F zg+X~wedj&$vLRFVc#|@Eojyc&;A3e2%6HIl_>Mkq$xDD?ir@$qg0tS(&68w z;ygpsCV?oN=q#ZC*aezsD*0KPp=TOS$~IjH*CCsFY%(Fc0c#`iV3EpoTWnh6)>03c z{|M?WSP=ps67ui3jAc9HO;whmJ^Va3e%`P<*a4BE7tmwB#U59aS}>hZ0>5OLT8_ z>1p-Ivv=|ec5nNKMPRUb@PV$>lPiUwSK>2#NM!ZL@aIsuWW;;cK)d%Q4cFG8;I_o||8mZoLm~)?cq6vfr8DYmpIceI|9d z-J}mu@bUjSr(+B^b~_kB*&!dqP$bXp_q&mXJ>p8Ypz}QaE4q&JAx)5t>mOiJ!Leb+ z|GHp+LpC!xQ$hxKwlA)u9q{3nJ!k(8r{PFtNr_P?=M0}CU!PwV((TTvonBmxq0pS( z(taY`bmY-XYYSxXUF95xbN=15g)%zy6@~@kQ1)p$;_aRotdEtfU~p;C{e(7E(L)Vt znzfw1X!v`J)G$#M@Ab3|E1w82@(R}5!?KN{9?_do<>(4fItb+XidLG@XkE+6<(59D zoks`w&;+O1kDp@3FCBG^!C%s5tPG#Vt)j*HnHpE!f(d@X(Z+h0V*F6IOAev#v&Y)k zvqL1;VZV)jQa!mkR*|q5%CmXr1D^R3+WGo@8gsaB>{cfpKT!~#aRX~d|QY-aR zKB&bXsjNn@Zqor3@fC=4*e4`XB*2jVVN&De%)f+uA4$qKq;ND#N5adfuJO~8RFhYG{ijP>=vo|vvJJ}5Tdx2qQ zbRTWM8KtYV-U(#l01ryBh-~(X;LpY5 zZU_wL!&MkP<>*rEyxt1%Vicdl+}<%w#L1w1+-08`*!D4jb??7tztKoQWR}5Bhnt)0 zSh-==BaiLAhCtMa+p{-m{Qt|E_aJ{&>Zs>i`3%&p$7+>0LK_*_>B6u1DQKpn_2iT{ z9GYOGPMWv3g>^xHRoEDp>sC>42F)^Q&K|gx4&GD;=9uvI3~G@79X;LL5af9C^a4xl z$IYv;tk6wMWo>Gaz8F0%$uO-1)A#rIQs1meSd6?2JBb|`SMO(D-}qCHhj9RqF71{W za)3RrnkoHs#P(_o&cWLdjD?ouDNF%#q8#e_C4VIx3j42GJ+$XiFJEW9xeU1~J z5&Oz%vUWs*8Bt_f09>Wq4bk1Vw0H}xT?!=d3MR=}xs*2WN0Q!k%2alE!gEhU61It|OXCaxW_82^9FDQ3GAfcWhQCmfoc9YezHg0=|q~tO0>+u>_ zy_S?(E&EGR{c*)f3(<0{(>mHM4Vt@krAjd%|Lt@Hs7j*1ndHcoIVA%U zqf}H}d1b>y$)Le<6#ZVvS%81sabO(s&XX`kH;Pbe-ofI$SC(q~-QsW-2q-dk;<=;XpRZD(k=i_yY|9l|qaSrkraS6QBb_f-j%8VBDYuUgn;I_XV9%f~=6S za$xmmMFyd}upqoFU%5)K{mgJOYC&OsU3NR3Q358-zgeWS!7o4@+YRXa-@x4^&3i(? zSDeAEM_m41Lo;Z<<5hh!Kav0xNAUr(?$(jYASq9-@>|S!eDrJo?i# zLpFbUO|>*-Sm!S%pJ3zvydJ=EDF>K4vZ5l!331FpqW3^DjzvNY%dH%O54FFQ5Kk$w z7)~}{!Mg;z9nd(YR8Muu-82CALpe`Craxyt+VDg7W0%$eJn6BAWvSe01-$a zXHKcCg^TI1@#?paD3lK{jU9iEyD%a2vCd318g#V>>#>*vpa+nVLPgFgAmJkid{%~U ajG#mFogtq8I4BAcV|rp#1FO>W0000 + + + +Mean Johnny Barrows + + + + + + + + + + + + + +
+
+
+
+
+

Mean Johnny Barrows

+
+ +
+
+
+
+
+ +
Mean Johnny Barrows
Film poster by John Solie
Directed byFred Williamson
Written byJolivett Cato
Charles Walker
StarringFred Williamson
Roddy McDowall
Stuart Whitman
Luther Adler
Jenny Sherman
Elliott Gould
Music byColeridge-Taylor Perkinson
Distributed byRamana Productions Inc.
Release date
+
  • January 1976 (1976-01) (U.S.)
+
Running time
75 minutes
CountryUnited States
LanguageEnglish
+

Mean Johnny Barrows is a 1976 American crime drama film starring Fred Williamson, who also directed the film; Stuart Whitman; Luther Adler; Jenny Sherman; and Roddy McDowall also star.[1] +

+ +

Plot

+

Johnny Barrows (played by Fred "The Hammer" Williamson) a winner of the Silver Star is dishonorably discharged from the army for punching out his Captain. Shipped back home Stateside, Johnny promptly gets mugged and hauled in by some racist cops who believe him to be drunk. Unable to secure gainful employment, Johnny finds himself on the soup line (with a cameo from "Special Guest Star" Elliott Gould) and down on his luck. +

Walking into an Italian restaurant hoping for a handout, he's offered a job as a killer by Mafiosi Mario Racconi (Stuart Whitman) and his girlfriend Nancy (Jenny Sherman) but Johnny turns him down. It seems that he's not slipped so far as to start doing odd jobs for the Mob. Eventually, Johnny lands a job at a gas station cleaning toilets and scrubbing floors for the mean penny-pinching Richard (R.G. Armstrong), who receives a beating for ripping off Barrows. +

Meanwhile, a Mafia war starts brewing between the Racconi family and the Da Vincis (the family, not the painter). Seems the Da Vinci family wants to bring in all kinds of dope and start peddling it to black and Hispanic kids. The Racconis, being an upstanding Mob family, wants no part of that on their streets. And so it goes, with the Racconi family wiped out in a treacherous double-cross, with only Mario left standing. +

Nancy is kidnapped by the Da Vinci family and gets a message to Johnny claiming that she was made to do "terrible things". Brought to the brink by poverty, The Man constantly screwing him and his love for Nancy, Johnny agrees to become a hired killer for Mario to avenge the Racconis. And so the body count starts going up as Johnny in all his white-suited glory gets mean and starts killing his way through the Da Vinci family. +

+

Cast

+ +

Additional notes

+

The structure of the film was previously used a year before in the film The Farmer (which was shot in 1975 but released in 1977). +

+

References

+
+
    +
  1. ^ "Mean Johnny Barrows". afi.com. Retrieved 2024-02-02. +
  2. +
+
+ + +


+

+
+
+
+
+
+
+
+ + \ No newline at end of file diff --git a/data/processed/wikipedia_html/tt0074888/La-meilleure-facon-de-marcher.jpg b/data/processed/wikipedia_html/tt0074888/La-meilleure-facon-de-marcher.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e66679c94941db42cbdf8a374eea216190841fc6 GIT binary patch literal 10766 zcmV+pD)H4)Nk&EnDgXdiMM6+kP&gp^DF6U4ngE>vD*6CS0Y1@IqfDqIqoSmZ>Egli+iqdMLxJ{F}0d_yMIL8b{Vvq z-~%xq(Nj)2R9IyymyT3s`AU>MmEf6}h$J3nFX{TxwB|GWYnhFh^9vQ2kO$per6roW}gY(HgtM?xwF zju^mwG<-?zYhf%AgHzW}NnD)i0WMj<^`aTY|Gfjnjnca3f zUU-;(u;A@~~dk8S_8?{hC!B)E|rz84P6YLD3t;UQ)TFsH*AZN@=!S z|MWM}o3{H67K>s35bNO!ECejp%ii;zNo@h&8z&N@+|rMeqBJ5*mSsqs#VSQXlAhY` z-MI0ujO(_58W_NenuV7Hdnp~SsmlicWY9P%NW8o~!gC_3mc)@Ia%G-6uynYFg&fi) zF?j>51){TAou9O>>vJD#LpHcpL5y-R z%J?Oy+8(lN*W+#XQJDle)Pc{-9r>3xC{;^0Uy-<*GYmU*kLEIV>Gd3#5s&q+8CWiEk?fTelD{gKA{Z9PLtaVk~$Y?#;^NB&%yLQ^vAp| z-^a<#xx0)|Fu}IVcv+{WRN2cBNHMOCEHdkn%_VHON*!#z-2j8EZ;J8hHt8Ir2DhlR zvbDV!q{2Vq>5c4jMunYG}14w;7c#RMe`M$(Z0XYY6)P+xC9%=6j3& zd^fh^A>WmLU~N*m_#j8To3Wqe{5*v04J4Y?6F2v(c+gpG1`HEQ0W zUW8%V*fSXGZYxi}cZ7wziJrh>!iQd3 z(%LREhxBrC5}_Mq^3}2GIoSl{G(}Q1!%`4;DW7@zFkq_89h_ z-ESOfiD`{XGIHo$vd*2#4c{RP+dl+5SZCJI!s2zYKl_&B$_$JzYgf)my-3SmB=IS3@6H;+Rz488 z+Bb^~n7M}tM1|#B_iM|P4cJn+^cD;DRz+LE9JRLyd963*=_|ED!(9fUZey=O$mBd6 zYwZolPe8*9{2ljB-aq`9E(I;;alaol4D>#9E&;DQUc;*zt%$&3b@vU?sU@e&hFz0U z<)G$JO&YCzB5gpnNm1zps&Qy`})*PC<;O10-aa%Lp+~>qTW_DvGFFth@ zjiuT_(JhNRa|#OeVs|dBScOcFw{w6;}h52D1_EDqN+E8woqW0&u@7*7Rv4zI=byg)U(WTRT?o&_5K$9!ZC!dwba6`$f4V z^N{V-(@J?uv}_-debfyfUk#<2NBnFk4i~@@>c1=Q-9k2vJ>@qL|34l%{1WsCnbV93 zMglpY5Ilsw9nbv~8D-oBonM)VI_qV4Y@_%Q=wzpO!0nwe`Vbn$jde}ui^T-eiO)-3 zXEY4Lx8Z41BInN1=ty_5G3zKoh)t}>IU*98135l%+5_%XhoQ$Qfpe9rmf zbwE_@3c;pY>{aL`O=|6Y^-8|aNHtRM2H*XJ)qRT)&vu)19^J*a)}LBW+wTStj}mWw$Xusb~Tnr!sl%4UoqX~zdRW~f5M<6gm_xXRc`xbS@NcVqd z`M}l9)%!{tQxffXchR?ZHY?fh`GWLM1B?W6wLEcDhzR`8W0t>wIBLT>S?n?-j(&c+ z(BRuU?ThVV1IjH*;2HecRa?7m!L(ic0BA1n9m%7!14)o`3L@19Kj`!M8SWu_y^T^M zwT^xg48$xjqimH^{rVa@V_J=M`3g{%Xy!OZ>lFJ`c7EJC_U1pT97s~tP(LOpamA@H zt5lRQjws!D%Rr8>&j^kB6gUG9L`I28bw%*Myc3s}F<}1!<1`B+vT`li#Mb8! z-wb}~y)@6oVFPA2a-;)O7)zk1Y;2;QxNw|0xYS-keFxT;A{H}z6r z9j5FCGghmuS4mYKjwssiA}=Xm@i4G336SO+t-5+1o7@7{ig(SqsnuEjY3u?+=h?Fw z&W%FRR@;uK^%Gg7>iuic!Ij6Pz{b4hF9TomxX>fzpPDgcr2Mp!tKcB9*_Ybdx&N>i zdoJg=#t{fbRAjDGz>!74jHHkyLAn+L_pLLygTmXspuQdUbo)+6dp)FY>Zw+BRHLx{ zD|!epPi9a5LPnJq8rhsNn~&Plxp?HWEA+DTegHtkk&YkJTv1i43v&U%uU;wrue06{ zW0A}Mp}FtE(LR8jSOvcLb<`-ycvbqCb~+EW$2hFOYKIz9iP2I7&a~ZsWTl`YhHM|H5f5m#3=g(;_*ou}F8WuRoC}Ir~teTfzeb ztfmlc#o#i-k!QS#3V4`(FBLnYy4UOPlHu?(WcsolNB$AMk13MpyP7{4WDT7G6x@ZyY(1 z6G>GvcA_j3g3A)qAMP3MT-~lrsem4gv=i0gix42r=>d6=Uq?;cU|y00tsf@gN4ix88kQynfn5G&I|N0G(6CZ*Kb6hE(!P`Qm zb}@j2`#;%PBdOSeelhqj{zzk^qk#~VK4kz}tnz;#`m-e-To-%EsQzsK%fRkhfNX1z z(L3Ju(VOu$s+Yzp*?@sy!uEET+ezjVsf9~o(pLI5)}KbjYbEFD5xK8m-T{KWO;zL5 zc-iVY{9x)JB;YV@XX3r4BT+}Vw*@S&+u5<)Mr5`+=Y8?DQHKCIyy%oSVHqZ$8 z(`rtm1c182U9%&=$L|`^6yZ`f&|@1M5>*o}1Mk7S7Y35Mo*~aTuxwknp2`1+5*M1p;i&|}P7?t<8aAb<@X1%^IRRpEn z-o3oxXPB#```5<*-ZdU>7{&aXMAndAFrnyrdnFLq7bp_&jZYdq!#J|C=vW|5q5(7_ zFtkR$Pymqqv)vT2P-GFOQV&7fP57^<#HFq5Kwb|4-bOGnX zE>Cs6rh5*Nb_Kcl-F)YXb?=1 z*~SH}x<76(O!cS0PD|BpLt0PWTCY^g`?V`26gUCt-nEj{X7#50Dzw7a^Ux9*jbB{= zVAEDwgTXEQ4ZQbrGL^Sm=w7!W>y6#P~B#wXOpm>A$a=U%v%2g>?CsEGFT4v@-dC! z8(@Ke&zI-b81sq_X_>o&)$7?GUrDJ|KYLae@)-kmCWRG;gmD4>@x`suUdSM#GR3!brhA$0s1;NNeWX7hwj*@vL_Ym!b(E zpO{(&h$&?Wvh?Z{s&eVO^mX!mZ$jnF!6t9S3ZXx_I9WWYf)Sb0mS9j%>C73c0(c2^ z<4vR_wLHp4BdDeCNQAgZww1B{BcjL!b)x{94uR(X^3sIF#;_G=VH#uT_t4U`sDVauH8!9UPO9mjfRajM}b(CSehWq2 zdE5BEyh)xr7$_^YBA2$7D=>PS;_}M|*#mR@EF!o;*U?w0a5Mh>?$U+kGUSC!8=LJp z@ihfNIYKx)(5wFF>EkbYJH&in%Yuz`LOH`-VtwHlROmm}Q6m+;4un+}*z~5o# z*y!8O=ibgVzzYGg0DiH!e9V|e!s(yp_KP1N5K9zGs`$1clJ`be)bS&K_^MLB1bHFDz-RMAUqE_ zhte&0Xlw@qSuOPd$z>B^2-40p&s7EL&VL+}dzpPKV3-&ZQ>PN~`Qg#qurXs(pByTy ztJ(1O0>pGruwqG4bj`|tupnnJ5oMnI#Wrm-oG2oSPd0=6pWGQAJ`jfi~emR zWd`F_Tos#Wyt@sFP_@fiTB8%_H5kfCoGz+plE&FzD~%q#1hmh4EXVI}X^^XK@u?@) zdzOJa#o1@n#;M(hYnX9bPu&u}Iakpv2jS-js_8BJC~Qh|qblQv&2;E&qP3tm`IKw| zA3%y&FgDjYYogQV+Nxuc?l59aAXe1lovr!sAj-gRT7{occkb4@K0*Cdx9(}D4PX2T zFmtnGpKvJ~K8sIgn!$*i^Ok-FjyhP@p-Nwu#=WZDbrjZ=Q=kg@-qch?iRcL2aGV>? z7WZP#JiLecqMz7Cbp^9lX&jW+(ibBD_z+y(0%=Va=kzh`+_4PqyRktM+JSRM;Tc@^ zMS=TEnIZkoHG)tHA{yZx74pw55*=g?$;`hX)|V?PTrD5Sh|%M>-D{p~YS@BrtF6eh zOqsXnL_s4pUU`(BRe?q{DVYQGuH(P!zKfqu^b6tJ!8q>>&#^KkybRV{k=3>)Y*(a9 zRg?6E#LgC%=M zqWy`nGT!VSffpOF9jrXmh?6YH1}EgmKrM_LY~Q3C?v7sbLA-8IT!Ezp>UhsKrh6&R!eI7X>mY zMjgOgT|?-AM$hkXHNk(e$nJVuFyW|-!rl%72G`jQZZEm37ojhf#p>+W5pguH2w{&A z{A|OJbk*DS8f4yqF&$oz2EKSgn|FLiqCoIOs}p+lXe8#K*d$TJ4?d{ZrH}SmD3C}& zM2#SNE`PK#7t)6LYosus$_oY-xSs`*wUe%C_6CNweE4wI0N`=G;?8>@m+{!kgRXoE z0!x(H#jjz)GgKWC)`7;mZ|y5l6C$lnzLoh(wPS27zU0SbnxPMhyswTrKpn4t&qJXO z?r@ixvs1Es#)@NgSTBg-!U0XRzLMBlhuygZ%p>4c{oH6^*Ya#krb7v1tkRsNTDcfZgV8 za`AEgV>UH-BcOoYdC(nh1CL{b$mI?u{-_@&HVlb<_8FN03Myiz$W-XSo{-w> zV6($H(0NhJCe{gsvpljzqOElcs%enFcw%%9rj1nEbwD^tA1&&-9|ax>_S@V!DE#Y`-Mi-PW>>~1 zK4AiV{u3;1d)xBCd$VCcD4evXS!NL*FROb=>Y}Gui?< z2#+0ThI_`RBX+PF5CC*q))Pl#$AqAeVsP6h&etdf;CHcEks$n6oy40w!w-!Sbr* z(5rYXJ5*}{8zc_}GXYAq<;0QZ=AvGU6*(BjQQBKW#JIJU!|3=5!G<g6PWii7SeQWt?g}~3zU_3*1~mp$r4F_`J%yvy(zo8POWAT`f=?+Tb=}g*-$BkDV`nD6J@M) z>?gPSuLX>e9eN|Rn`!~)-=)|89KG*(sEnv`9WVO(F|{$$Goo0sTt9wdn6^0pLk+w74opfRD*`tTF?V7?{K+SMaRc*#WE&Fbo}A3Pv1osm zwKfkCWXmhTm9ED5O_7>&y!%s`hSPWhqis=*Nu!7xAA5jj?rB-M^0!km~u z)>6U?-sPPk$Z59T-YF6s3cXHh3#07(QxUOm|E4EIhcF9k-j@<9QmQoGqQ9jON7oQt z;blF=m`03=y{I8vJ_Z8qE144FYSrfrr%-%~V(%1^MlS=^T$ey4u)Cq03;q$}BZOE8 za~itBFjie%U_~JClzGxZtZRCjeebO4?*2{E+w0!MHGB-)SNgdyfQ>rpr9cANHkwv> z4+PLK>>t7ayxmUdifGR-QTifaWzarw>;H^Ov*6eUxtdy5ydrf&xQA-99{H+pk`qHQ zWI@RUNS)ivXhkn1ODJoFJU)oL-ba51TQgl2ibuuh{}8TYa3#K_xXf9EwW0>-mYS!z zN!3vj&V*DE&F}&Ni8B>!X>yo<6l<|xm$FJ11&!d25`({M z-!FvW=yuB2thkj@XUB3W#O!@U$^rAlEMwO%uaT3qAKkE>LRut*OW2SSs;Qq(;7yI9 zy)dJt6Fh}TQ9Ns`W-^L^#h&qlcOe|=la=!eP1H6u@yjUT(0MfQ+ID zjAO4WS@+TO9l7|@kN4G3w*rVh{cygvy#(K4ZXlD=I{NOh{~zpheh5qRBkOPpb6Wk# z9Q*bG0=|QWlt*K(8e}3$#Ao{ryvB42(SGdq`e_!rAekd+QiPn}$c95Q+_~AN+L(b-T@3YnooCft0mXZLxrq~F9o2_6ow^0!InpSqKlljl^u-qqJQJG( zGX$haio3@QzbDznsR8VRc)@wNTSEx4t}t=e=+&Y;J}F)+c?qV+8~b5=<$_xIga72i z6Vbd9q!=UU`_B}hh#lKedxL%ta5&-BGaLQE(DMI1jD78_8n4X`g&#lGrX3M5lMi9_ zR$5^035%iE-RI*OTG7!}FhTxJ)!hW}hSdd3xjSe6_H?k1GA9A^5;%B@ePL(K%vfCd zyQL;QKSfWuu#9^w5aDdJvbN77?OilT(u(Wj)ZuA7FG}Wo{TH2Q&@U|CvbSsSg69^@ z@jq^5&nv=rPe^3%a4bUE>YYKo2%y#8G0q|_){Q%&AzIN%IIxvr1P%UMlrR}&Wg#Zn zX}S*w+g0N?7oU^RqtoT>OhQJcj?t+yYM(Pb$uX%r1bZ?D9IiUk@>1aF)Ie};Ah$o( zL+xmZYjsv~FL#p}uX-41qETperAPy`SMsQFw(gUB=5JRlY@m5Q7g(>RGv{cnB^&w` z-uhq=kWKMi_oOxJ;t==~*}92p(QUM2t}|T-DO@mRN7)QkL6P2&*#ZP&^^6gxHO2d{ zKS%GRRW#5yv$zQR#KW}Peu2vwNzGdGa-FC3&=|CNBNp@5GYY>7#2!=tSN)S&(!I{} zWUiX9l7oX5Ma?Vnek*1JV+QE>1|eWn)P8#DKva_fO3V=HW|W*99_iFqopIJl!uhG% z#I0K_dLmc*VX_>(Sb5PBYa{~FScVU_*L(R}M4EA|wvcs>AkoknrEU5yp5UT3i=2@b z4?oJ1WK22|Xpz43^s>+a>MS=SYV_+_1Hb`wa3S7#oYN++-yT=tln1Ag{DP^l%s{Ua zw0ezKzl(sRqYFO3i0za#dk~(vPxt|Z^qZTg8m9N#pYQ~IAiQy>kp_;P3O^gjtbqGb{3Dqy#jj)|}_tIR1@l&0n7pA!Czq%%vjsxO-U-=jZg;6=-j#lSu{${Y1yN&wYT<=GkB zI&V;SMT zhDCp1OH#l=%1#DsJDF}6AtnMgU3XEY^lnd}mkCdj=WUb{@LF#Z3G#H_UZ;Nhx@rGM zw=Y$H$SryWse>)5>FrHeC-DaA_#A>oDF~OkVKrs>MjD`6Fi3Fd7tXxw548g?mFost z1p>pNRHm;+Ym_5p>F=jf4_;fMN_JqP{q6AD55LwDdZ+6YmR z|NokDkZ#IajVUZJBwvxnJ$TW$^m@%;c}!1XSoK34U+A{{p1ZGuB3sJi?|v)08OhAJ zzVf_6clPX2kJl?tS;)yqxc(bg=wHKNYU6v{LYvH}JpM<&R}4M;p;=7O?aw8C6W8Ih zmo6*Z4a|8KfWR<1g1gqNDS|#2V^>frpb$&_8)LY^8P?6YAKQz#AVl^eQ!Fv>En8ME z$jh+g&><_a9bRyfER|uV!3gx!F<_*eQQ9w%D zD`k1-FY8l@UVQF>h<}w*z?~6d!{3IWBBt*fEtv@38|3{`j87j66TBL=!KbD6`; zPZeO@^7_oIwqXGF`Tzwl1^@A%ED1zfX!%(MV$ZfzPH_0MFALhr(@5RZ+6yCzT>YO- zY_ckLt266XyZ(e%8EjE|VpZqT?tGlEK4yC>YVn(@m3PN;ZG$ee+#tG4M6U#NJ2#Zi zT~WSbZ^%+5-|4O10F6VTb;=ZYdg>g+raq4N#Kv;%i>a}^Dn%U7y zz$|zDc`-^#EuJ(K))Yi(Diu9UfqaUU;X*3kh)PzlYbvf2xc1mo$I!8VRCQ6P>t_O5 z``p5smRzY>P0@u0dp$bE0h3=tO*j*bv9POeYFs`&Y*ksrxqAg&omDaYc4@?irp~%2 zFoltfmdBmmGo=l(5#AJWePSE{Ee2oF_c_hPh2o%0JmRC`7}KAh9{}+3nt|9qvc2M~ zL_kkb)V{zwyBfFxfD2nb%{{b-cB;~+Cn5p4nJ~LBsd}#%T^`yiQ}1u%&}$q7uvVSV z+I?z0G?}>vn*wjwA#E8beqP$gXe5KdP`c)_*K-WOV-g|$6l!(lC(ONdkTke$*jY>~ z<=j32D!c)UDfAQY)Nd~_$ zy+J8nx253sl5vZY)yhll5_U;Tiz_n&yqTSR4HT+}4+j23e_$J(JD9?T>TA)EP#KGh zTqFNvx@T^HgK7S1q#^dpLDKi*WF_m1N4IUnwnIG2xk^QxpRfhV&5bxbapDI7XmFWj$Nvx#|LD2glet5#nu#qjkZlyQuXK{$DELDKMhA~N;z zPpP+Im(cwSMrA%tNPhZJw5dp8nT3enKhEr4>tBu>&<`;5?(A6FdEw;bNGRsU|D=6m zJ*@}0wshN~C&Z8LO1bOedBJ8q>}z_L$L zxrkOI{8Ft4YQtZ<_P*z4A2KGCZO_iZ&O`y5{OpZgW=dceUWzHuu>QZYhrf@uU`aMj znd4Yny{Km>!Z|l_Hx?30Eua7ZULXJfDpyj-K1((2QbvJxxt;AKxzg^e9=c75wAi2Sq(acq + + + +The Best Way to Walk + + + + + + + + + + + + + +
+
+
+
+
+

The Best Way to Walk

+
+ +
+
+
+
+
The Best Way to Walk
Theatrical release poster
Directed byClaude Miller
Written byLuc Béraud
Claude Miller
Produced byMag Bodard
Jean-François Davy
StarringPatrick Dewaere
Patrick Bouchitey
Christine Pascal
Claude Piéplu
CinematographyBruno Nuytten
Edited byJean-Bernard Bonis
Music byAlain Jomy
Distributed byAMLF
Release dates
+
  • 3 March 1976 (1976-03-03) (France)
  • +
  • 15 January 1978 (1978-01-15) (U.S.)
+
Running time
82 minutes
CountryFrance
LanguageFrench
Box office$13,793[1] (2008 French reissue)
+

The Best Way to Walk (French: La meilleure façon de marcher) is a 1976 French film directed by Claude Miller, his directorial debut. It stars Patrick Dewaere, Patrick Bouchitey, Christine Pascal, Claude Piéplu and Michel Blanc.[2] +

+ +

Plot

+

Marc and Philippe are two teenage counselors at a summer vacation camp in the French countryside in 1960. Marc is very virile, while Philippe is more reserved. One night, Marc surprises Philippe dressed and made-up like a woman. He responds by continually humiliating Philippe. Despite their late-adolescent rivalries and sexual confusion, each achieves an awakening. +

+

Awards

+

The film won the César Award for Best Cinematography, and was nominated for Best Film, Best Actor, Best Director, Best Screenplay, Dialogue or Adaptation and Best Sound. +

+

Cast

+ +

References

+
+
    +
  1. ^ "The Best Way to Walk". +
  2. +
  3. ^ "The Best Way to Walk". unifrance.org. Retrieved 2014-03-10. +
  4. +
+
+ + +


+

+
+
+
+
+
+
+
+ + \ No newline at end of file From cfbddf2a24ed244ba9fe47f693662a3ba3f37705 Mon Sep 17 00:00:00 2001 From: prabhaavp Date: Tue, 10 Mar 2026 14:15:33 -0400 Subject: [PATCH 5/7] - Updates to make it name the folder the name of the wikipedia slug. Fix needed: Certain characters can't be used for folder names. Need to fix it so those characters are removed. --- scripts/extract_wiki_zim.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/scripts/extract_wiki_zim.py b/scripts/extract_wiki_zim.py index ec7854904..9a7b80fa4 100644 --- a/scripts/extract_wiki_zim.py +++ b/scripts/extract_wiki_zim.py @@ -1,3 +1,5 @@ +import shutil + from bs4 import BeautifulSoup import os from libzim.reader import Archive @@ -5,7 +7,7 @@ from libzim.search import Query, Searcher import csv BASE_DIR = os.path.dirname(os.path.abspath(__file__)) -INPUT_TSV = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/imdb_datasets/title.basics.test.tsv")) +INPUT_TSV = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/imdb_datasets/title.basics.tsv")) OUTPUT_DIR = os.path.abspath(os.path.join(BASE_DIR, "../data/processed/wikipedia_html")) ZIM_PATH = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/wikipedia/wikipedia_en_all_maxi_2025-08.zim")) @@ -44,7 +46,7 @@ def fetch_wikipedia_html_with_images(query, save_dir): with open(img_file_path, "wb") as f: f.write(img_bytes) img["src"] = img_name - return str(soup) + return str(soup), best_path #Go through each row of the tsv file and try to get the movie on wiki with open(INPUT_TSV, encoding="utf-8") as f: @@ -59,15 +61,24 @@ with open(INPUT_TSV, encoding="utf-8") as f: continue # folder for each movie movie_dir = os.path.join(OUTPUT_DIR, tconst) + os.makedirs(movie_dir, exist_ok=True) query = f"{title} ({year} film)" if year != "\\N" else title #if year not empty print(f"fetching Wikipedia HTML + images for {tconst}: {query}") - html_with_images = fetch_wikipedia_html_with_images(query, movie_dir) + result = fetch_wikipedia_html_with_images(query, movie_dir) + if result is None: + print("Wikipedia fetch failed") + continue + else: + html_with_images, slug = result + slug_dir = os.path.join(OUTPUT_DIR, slug) + os.rename(movie_dir, slug_dir) if html_with_images: if "Directed by" not in html_with_images: - os.rmdir(movie_dir) + if os.path.exists(slug_dir): + shutil.rmtree(slug_dir) continue - outfile = os.path.join(movie_dir, f"{tconst}.html") + outfile = os.path.join(slug_dir, f"{tconst}.html") if os.path.exists(outfile): continue with open(outfile, "w", encoding="utf-8") as out: From 1614d85270f4b2b845d569416e481fb079cbe946 Mon Sep 17 00:00:00 2001 From: prabhaavp Date: Tue, 10 Mar 2026 14:45:45 -0400 Subject: [PATCH 6/7] - Fixed Bug: Certain characters can't be used for folder names. Need to fix it so those characters are removed. There is now a sanitize_slug function used --- scripts/extract_wiki_zim.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/scripts/extract_wiki_zim.py b/scripts/extract_wiki_zim.py index 9a7b80fa4..d029d7870 100644 --- a/scripts/extract_wiki_zim.py +++ b/scripts/extract_wiki_zim.py @@ -1,5 +1,5 @@ import shutil - +import re from bs4 import BeautifulSoup import os from libzim.reader import Archive @@ -16,6 +16,11 @@ zim = Archive(ZIM_PATH) searcher = Searcher(zim) print("The Zim file is now opened") +def sanitize_slug(slug): + slug = slug.replace("/", "_").replace("\\", "_") + slug = re.sub(r'[<>:"|?*]', "_", slug) + return slug[:200] + #Fetch the html AND the images and put them in a folder def fetch_wikipedia_html_with_images(query, save_dir): q = Query().set_query(query) @@ -59,29 +64,40 @@ with open(INPUT_TSV, encoding="utf-8") as f: if year is None or titleType != "movie": print("Skipping from TSV: ", title) continue + already_done = False + for d in os.listdir(OUTPUT_DIR): + if os.path.exists(os.path.join(OUTPUT_DIR, d, f"{tconst}.html")): + already_done = True + break + if already_done: + print(f"Skipping already processed: {tconst}") + continue # folder for each movie - movie_dir = os.path.join(OUTPUT_DIR, tconst) - + movie_dir = os.path.join(OUTPUT_DIR, f"_tmp_{tconst}") os.makedirs(movie_dir, exist_ok=True) query = f"{title} ({year} film)" if year != "\\N" else title #if year not empty print(f"fetching Wikipedia HTML + images for {tconst}: {query}") result = fetch_wikipedia_html_with_images(query, movie_dir) if result is None: print("Wikipedia fetch failed") + shutil.rmtree(movie_dir, ignore_errors=True) continue else: html_with_images, slug = result - slug_dir = os.path.join(OUTPUT_DIR, slug) - os.rename(movie_dir, slug_dir) + slug_dir = os.path.join(OUTPUT_DIR, sanitize_slug(slug)) if html_with_images: if "Directed by" not in html_with_images: - if os.path.exists(slug_dir): - shutil.rmtree(slug_dir) + shutil.rmtree(movie_dir, ignore_errors=True) continue + if os.path.exists(slug_dir): + shutil.rmtree(movie_dir, ignore_errors=True) + else: + os.rename(movie_dir, slug_dir) outfile = os.path.join(slug_dir, f"{tconst}.html") if os.path.exists(outfile): continue with open(outfile, "w", encoding="utf-8") as out: out.write(html_with_images) else: + shutil.rmtree(movie_dir, ignore_errors=True) print(f"no Wikipedia page found for {query}") \ No newline at end of file From 525e359c6b59db7c63a18f488c950d57294f4a9c Mon Sep 17 00:00:00 2001 From: prabhaavp Date: Thu, 12 Mar 2026 12:14:31 -0400 Subject: [PATCH 7/7] - Html -> TSV --- requirements.txt | 3 +- scripts/extract_wiki_html.py | 115 +++++++++++++++++++++++++++++++++++ scripts/extract_wiki_zim.py | 6 +- scripts/rank_cols.py | 63 +++++++++++++++++++ 4 files changed, 183 insertions(+), 4 deletions(-) create mode 100644 scripts/extract_wiki_html.py create mode 100644 scripts/rank_cols.py diff --git a/requirements.txt b/requirements.txt index 7b5077898..94ce5a178 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ pandas~=3.0.0 dtale~=3.19.1 requests~=2.32.5 beautifulsoup4~=4.14.3 -libzim~=3.8.0 \ No newline at end of file +libzim~=3.8.0 +python-slugify~=8.0.4 \ No newline at end of file diff --git a/scripts/extract_wiki_html.py b/scripts/extract_wiki_html.py new file mode 100644 index 000000000..c6cfced93 --- /dev/null +++ b/scripts/extract_wiki_html.py @@ -0,0 +1,115 @@ +import os +import re +import csv +import pandas as pd +from bs4 import BeautifulSoup + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +INPUT_DIR = os.path.join(BASE_DIR, "../data/processed/wikipedia_html") +OUTPUT_TSV = os.path.join(BASE_DIR, "../data/processed/spreadsheet/wikipedia_metadata4.tsv") + +WHITELIST = { + "slug", + "title", + "poster_filename", + "Directed by", + "Produced by", + "Written by", + "Starring", + "Release date", + "Running time", + "Country", + "Language", + "Budget", + "Box office", + "Plot" +} + +def clean(el): + if not el: + return "" + for br in el.find_all("br"): + br.replace_with(" | ") + return re.sub(r"\s+", " ", el.get_text(" ", strip=True)).strip() + +def parse_html(path, slug): + with open(path, encoding="utf-8") as f: + soup = BeautifulSoup(f, "html.parser") + row = {"slug": slug} + h1 = soup.select_one("h1.firstHeading") + if h1: + row["title"] = h1.get_text(strip=True) + else: + row["title"] = "" + # infobox + infobox = soup.select_one("table.infobox") + if infobox: + img = infobox.select_one("img") + if img and img.get("src"): + row["poster_filename"] = os.path.basename(img["src"]) + else: + row["poster_filename"] = "" + for tr in infobox.select("tr"): + th = tr.select_one(".infobox-label") + td = tr.select_one(".infobox-data") + if th and td: + row[clean(th)] = clean(td) + # sections + content = soup.select_one(".mw-parser-output") + if not content: + return {k: v for k, v in row.items() if k in WHITELIST} + skip = {"references", "external links", "see also"} + current = None + lead = [] + for el in content.children: + if getattr(el, "name", None) == "div" and "mw-heading" in el.get("class", []): + h = el.find(["h2", "h3", "h4", "h5", "h6"]) #assuming no more than first 6 headers need to be looked at + if h: + title = clean(h) + if title.lower() in skip: + current = None + else: + current = title + if current: + row[current] = "" + continue + if not current: + if getattr(el, "name", None) == "p": + text = clean(el) + if text: + lead.append(text) + continue + if el.name in ["p", "ul", "ol", "table"]: + text = clean(el) + if text: + row[current] += text + if lead: + if row.get("Plot"): + row["Plot"] = " | ".join(lead) + " | " + row["Plot"] + else: + row["Plot"] = " | ".join(lead) + return {k: v for k, v in row.items() if k in WHITELIST} + +def main(): + rows = [] + for folder in os.listdir(INPUT_DIR): + path = os.path.join(INPUT_DIR, folder) + html = next((f for f in os.listdir(path) if f.endswith(".html")), None) + if not html: + continue + try: + rows.append(parse_html(os.path.join(path, html), folder)) + except Exception as e: + print("error:", html, e) + df = pd.DataFrame(rows).fillna("") + if df.empty: + print("The folder was empty / None parsed") + return + cols = ["slug", "poster_filename"] + [c for c in df.columns if c not in ("slug", "poster_filename")] + df = df[cols] + os.makedirs(os.path.dirname(OUTPUT_TSV), exist_ok=True) + df.to_csv(OUTPUT_TSV, sep="\t", index=False, quoting=csv.QUOTE_NONE, escapechar="\\") + print(f"Wrote {len(df)} rows -> {OUTPUT_TSV}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/extract_wiki_zim.py b/scripts/extract_wiki_zim.py index d029d7870..38955b63a 100644 --- a/scripts/extract_wiki_zim.py +++ b/scripts/extract_wiki_zim.py @@ -5,6 +5,7 @@ import os from libzim.reader import Archive from libzim.search import Query, Searcher import csv +from slugify import slugify BASE_DIR = os.path.dirname(os.path.abspath(__file__)) INPUT_TSV = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/imdb_datasets/title.basics.tsv")) @@ -16,10 +17,9 @@ zim = Archive(ZIM_PATH) searcher = Searcher(zim) print("The Zim file is now opened") + def sanitize_slug(slug): - slug = slug.replace("/", "_").replace("\\", "_") - slug = re.sub(r'[<>:"|?*]', "_", slug) - return slug[:200] + return slugify(slug, separator="_", max_length=200) or "_unknown" #Fetch the html AND the images and put them in a folder def fetch_wikipedia_html_with_images(query, save_dir): diff --git a/scripts/rank_cols.py b/scripts/rank_cols.py new file mode 100644 index 000000000..03aa1ed94 --- /dev/null +++ b/scripts/rank_cols.py @@ -0,0 +1,63 @@ +import os +import csv +import sys +from collections import defaultdict +from tqdm import tqdm + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +TSV_PATH = os.path.join(BASE_DIR, "../data/processed/spreadsheet/wikipedia_metadata3.tsv") +OUTPUT_PATH = os.path.join(BASE_DIR, "../data/processed/spreadsheet/rank_cols_output.txt") + +csv.field_size_limit(min(sys.maxsize, 2**31 - 1)) # try to increase max buffer so it doesn't fail +#https://stackoverflow.com/questions/53538888/counting-csv-column-occurrences-on-the-fly-in-python + +def main(): + lines = [] + + def log(msg=""): + print(msg) + lines.append(str(msg)) + + log(f"Reading: {TSV_PATH}") + + file_size = os.path.getsize(TSV_PATH) + col_filled = defaultdict(int) + row_count = 0 + + with open(TSV_PATH, encoding="utf-8", buffering=4 * 1024 * 1024) as f: + reader = csv.reader(f, delimiter="\t") + headers = next(reader) + num_cols = len(headers) + + with tqdm(total=file_size, unit="B", unit_scale=True, unit_divisor=1024, desc="Processing") as pbar: + for row in reader: + row_count += 1 + for i, val in enumerate(row): + if val and val.strip(): + col_filled[headers[i]] += 1 + pbar.update(sum(map(len, row)) + num_cols) #progress bar + + log(f"\nTotal rows: {row_count:,}") + log(f"Total columns: {num_cols}\n") + + ranked = sorted( + headers, + key=lambda c: col_filled.get(c, 0) / row_count, + reverse=True, + ) + + log(f"{'#':<5} {'Column':<40} {'Filled':>10} {'Total':>10} {'Fill %':>8}") + log("-" * 75) + for i, col in enumerate(ranked, 1): + filled = col_filled.get(col, 0) + pct = filled / row_count * 100 + log(f"{i:<5} {col:<40} {filled:>10,} {row_count:>10,} {pct:>7.1f}%") + + with open(OUTPUT_PATH, "w", encoding="utf-8") as out: + out.write("\n".join(lines)) + + print(f"\nOutput written to: {OUTPUT_PATH}") + + +if __name__ == "__main__": + main() \ No newline at end of file