diff --git a/new_met.py b/new_met.py
new file mode 100644
index 000000000..0a09c6046
--- /dev/null
+++ b/new_met.py
@@ -0,0 +1,24 @@
+import requests
+
+url = "https://ulse.wd5.myworkdayjobs.com/wday/cxs/ulse/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1"
+# url = 'https://motorolasolutions.wd5.myworkdayjobs.com/en-US/Careers/job/Schaumburg-IL/Machine-Learning-AI-Intern--Summer-2026-Internship-Program-_R59959?jr_id=69793e9952f3c27ec6459544'
+
+# url = 'https://mcafee.wd1.myworkdayjobs.com/External/job/US-California-San-Jose/Data-Science-Intern_JR0032270?jr_id=6979499688e2b47213bd807f'
+page = requests.get(url)
+
+print(page.content)
+
+# data = page.json()
+# print(data.keys())
+
+# print('here')
+# job = data["jobPostingInfo"]
+
+# print(job.keys())
+
+# company = data["hiringOrganization"]
+
+# print(company.keys())
+
+# comp = company["name"]
+# print("Company:", comp)
\ No newline at end of file
diff --git a/part2.py b/part2.py
new file mode 100644
index 000000000..392e90895
--- /dev/null
+++ b/part2.py
@@ -0,0 +1,27 @@
+import requests
+
+endpoints = [
+    "https://ulse.wd5.myworkdayjobs.com/wday/cxs/ulse/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1",
+    "https://ulse.wd5.myworkdayjobs.com/wday/cxs/ulse/ulsecareers/sidebar/Data-Science---Engineering-Intern_JR1410-1",
+]
+
+headers = {
+    "User-Agent": "Mozilla/5.0",
+    "Accept": "application/json,text/plain,*/*",
+}
+
+for url in endpoints:
+    r = requests.get(url, headers=headers, timeout=30)
+    print("\n===", url, "===")
+    print("Status:", r.status_code)
+    print("Content-Type:", r.headers.get("content-type"))
+    # Try JSON
+    try:
+        data = r.json()
+        print("JSON parsed ✅")
+        if isinstance(data, dict):
+            print("Top-level keys:", list(data.keys())[:40])
+        else:
+            print("Type:", type(data), "Len:", len(data))
+    except Exception:
+        print("Not JSON. Text preview:\n", r.text[:600])
diff --git a/scrape.py b/scrape.py
new file mode 100644
index 000000000..ae3a0f003
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,68 @@
+from playwright.sync_api import sync_playwright
+from bs4 import BeautifulSoup
+import re
+
+URL = "https://ulse.wd5.myworkdayjobs.com/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1?jr_id=6979179b39f7f96cc6d173c0"
+
+with sync_playwright() as p:
+    browser = p.chromium.launch(headless=True)
+    page = browser.new_page()
+    page.goto(URL, wait_until="networkidle", timeout=60000)
+
+    # Wait a bit more for dynamic content (Workday can be heavy)
+    page.wait_for_timeout(2000)
+
+    html = page.content()
+    browser.close()
+
+soup = BeautifulSoup(html, "lxml")
+text = soup.get_text(" ", strip=True)
+
+print("Rendered text length:", len(text))
+print("Title:", soup.title.get_text(strip=True) if soup.title else None)
+# print("\nSample:\n", text[:1500])
+
+def extract_job_description(full_text):
+    lt = full_text.lower()
+
+    start_markers = [
+        "job description",
+        "about the role",
+        "job summary"
+    ]
+
+    end_markers = [
+        "equal opportunity",
+        "eeo",
+        "ul is an equal",
+        "application process",
+        "how to apply",
+        "privacy",
+        "cookies"
+    ]
+
+    start_idx = -1
+    for m in start_markers:
+        i = lt.find(m)
+        if i != -1:
+            start_idx = i + len(m)
+            break
+
+    if start_idx == -1:
+        return None
+
+    end_idx = len(full_text)
+    for m in end_markers:
+        i = lt.find(m, start_idx)
+        if i != -1:
+            end_idx = i
+            break
+
+    description = full_text[start_idx:end_idx].strip()
+    return description
+
+job_desc = extract_job_description(text)
+
+print("\n--- CLEAN JOB DESCRIPTION ---\n")
+print(job_desc if job_desc else "No job description found.")
+print("\nLength:", len(job_desc) if job_desc else 0)  # guard: job_desc may be None
diff --git a/try.py b/try.py
new file mode 100644
index 000000000..7858fb39e
--- /dev/null
+++ b/try.py
@@ -0,0 +1,31 @@
+from playwright.sync_api import sync_playwright
+
+URL = "https://ulse.wd5.myworkdayjobs.com/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1?jr_id=6979179b39f7f96cc6d173c0"
+
+hits = []
+
+def on_response(resp):
+    req = resp.request
+    rtype = req.resource_type
+    ct = resp.headers.get("content-type", "")
+
+    # Job data is usually fetched via XHR/fetch and returns JSON
+    if rtype in ("xhr", "fetch") and "application/json" in ct:
+        hits.append(resp.url)
+
+with sync_playwright() as p:
+    browser = p.chromium.launch(headless=True)
+    page = browser.new_page()
+    page.on("response", on_response)
+
+    page.goto(URL, wait_until="domcontentloaded", timeout=60000)
+
+    # Scroll a bit in case job details load on scroll
+    page.mouse.wheel(0, 2000)
+    page.wait_for_timeout(4000)
+
+    browser.close()
+
+print("XHR/FETCH JSON URLs found:", len(hits))
+for u in hits[:50]:
+    print(u)