- Html -> TSV

This commit is contained in:
prabhaavp
2026-03-12 12:14:31 -04:00
parent 1614d85270
commit 525e359c6b
4 changed files with 183 additions and 4 deletions

View File

@@ -5,6 +5,7 @@ import os
from libzim.reader import Archive
from libzim.search import Query, Searcher
import csv
from slugify import slugify
# Absolute path of the directory that contains this file.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Absolute, normalized path to title.basics.tsv, resolved relative to BASE_DIR
# (one level up, then data/raw/imdb_datasets).
INPUT_TSV = os.path.abspath(os.path.join(BASE_DIR, "../data/raw/imdb_datasets/title.basics.tsv"))
@@ -16,10 +17,9 @@ zim = Archive(ZIM_PATH)
# Create a Searcher over the already-opened archive `zim`, then report on
# stdout that the ZIM file is open.
searcher = Searcher(zim)
print("The Zim file is now opened")
def sanitize_slug(slug):
    """Return a version of ``slug`` that is safe to use as a file name.

    Forward slashes, backslashes and the characters ``< > : " | ? *`` are
    each replaced with an underscore, and the result is truncated to 200
    characters. Case, spaces and all other characters are kept as-is.

    Args:
        slug: Text to turn into a file or directory name.

    Returns:
        The sanitized name, or ``"_unknown"`` when the result would be
        empty, so callers never receive an empty path component.
    """
    # NOTE(review): this block previously ended with a second, unreachable
    # ``return slugify(slug, separator="_", max_length=200) or "_unknown"``
    # placed after the unconditional slice-return. Only its empty-result
    # fallback is adopted here; switching to slugify would change the names
    # produced for existing inputs, so that needs a deliberate decision.
    cleaned = re.sub(r'[\\/<>:"|?*]', "_", slug)
    return cleaned[:200] or "_unknown"
# Fetch the HTML and the images, and put them in a folder.
def fetch_wikipedia_html_with_images(query, save_dir):