diff --git a/new_met.py b/Not_used/new_met.py similarity index 100% rename from new_met.py rename to Not_used/new_met.py diff --git a/part2.py b/Not_used/part2.py similarity index 100% rename from part2.py rename to Not_used/part2.py diff --git a/scrape.py b/scrape.py deleted file mode 100644 index ae3a0f003..000000000 --- a/scrape.py +++ /dev/null @@ -1,68 +0,0 @@ -from playwright.sync_api import sync_playwright -from bs4 import BeautifulSoup -import re - -URL = "https://ulse.wd5.myworkdayjobs.com/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1?jr_id=6979179b39f7f96cc6d173c0" - -with sync_playwright() as p: - browser = p.chromium.launch(headless=True) - page = browser.new_page() - page.goto(URL, wait_until="networkidle", timeout=60000) - - # Wait a bit more for dynamic content (Workday can be heavy) - page.wait_for_timeout(2000) - - html = page.content() - browser.close() - -soup = BeautifulSoup(html, "lxml") -text = soup.get_text(" ", strip=True) - -print("Rendered text length:", len(text)) -print("Title:", soup.title.get_text(strip=True) if soup.title else None) -# print("\nSample:\n", text[:1500]) - -def extract_job_description(full_text): - lt = full_text.lower() - - start_markers = [ - "job description", - "about the role", - "job summary" - ] - - end_markers = [ - "equal opportunity", - "eeo", - "ul is an equal", - "application process", - "how to apply", - "privacy", - "cookies" - ] - - start_idx = -1 - for m in start_markers: - i = lt.find(m) - if i != -1: - start_idx = i + len(m) - break - - if start_idx == -1: - return None - - end_idx = len(full_text) - for m in end_markers: - i = lt.find(m, start_idx) - if i != -1: - end_idx = i - break - - description = full_text[start_idx:end_idx].strip() - return description - -job_desc = extract_job_description(text) - -print("\n--- CLEAN JOB DESCRIPTION ---\n") -print(job_desc if job_desc else "No job description found.") -print("\nLength:", len(job_desc)) diff --git a/try.py b/try.py deleted file mode 100644 index 7858fb39e..000000000 --- a/try.py +++ /dev/null @@ -1,31 +0,0 @@ -from playwright.sync_api import sync_playwright - -URL = "https://ulse.wd5.myworkdayjobs.com/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1?jr_id=6979179b39f7f96cc6d173c0" - -hits = [] - -def on_response(resp): - req = resp.request - rtype = req.resource_type - ct = resp.headers.get("content-type", "") - - # Job data is usually fetched via XHR/fetch and returns JSON - if rtype in ("xhr", "fetch") and "application/json" in ct: - hits.append(resp.url) - -with sync_playwright() as p: - browser = p.chromium.launch(headless=True) - page = browser.new_page() - page.on("response", on_response) - - page.goto(URL, wait_until="domcontentloaded", timeout=60000) - - # Scroll a bit in case job details load on scroll - page.mouse.wheel(0, 2000) - page.wait_for_timeout(4000) - - browser.close() - -print("XHR/FETCH JSON URLs found:", len(hits)) -for u in hits[:50]: - print(u)