Merge pull request #1 from IshaAtteri/isha has the code

Isha
2026-02-11 17:54:04 -05:00
parent 369f5ced89 0cc571727b
commit ed2e20f8cd
5 changed files with 180 additions and 0 deletions
--- a/new_met.py
+++ b/new_met.py
@@ -0,0 +1,24 @@
+import requests
+
+# Workday "cxs" URL of the job posting to fetch.
+url = "https://ulse.wd5.myworkdayjobs.com/wday/cxs/ulse/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1"
+
+# Alternative postings (disabled):
+# url = 'https://motorolasolutions.wd5.myworkdayjobs.com/en-US/Careers/job/Schaumburg-IL/Machine-Learning-AI-Intern--Summer-2026-Internship-Program-_R59959?jr_id=69793e9952f3c27ec6459544'
+# url = 'https://mcafee.wd1.myworkdayjobs.com/External/job/US-California-San-Jose/Data-Science-Intern_JR0032270?jr_id=6979499688e2b47213bd807f'
+
+# requests has no default timeout, so bound the wait like part2.py does.
+page = requests.get(url, timeout=30)
+
+print(page.content)
+
+# Disabled exploration of the response as JSON:
+# data = page.json()
+# print(data.keys())
+# print('here')
+# job = data["jobPostingInfo"]
+# print(job.keys())
+# company = data["hiringOrganization"]
+# print(company.keys())
+# comp = company["name"]
+# print("Company:", comp)
--- a/part2.py
+++ b/part2.py
@@ -0,0 +1,27 @@
+import requests
+
+endpoints = [
+    "https://ulse.wd5.myworkdayjobs.com/wday/cxs/ulse/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1",
+    "https://ulse.wd5.myworkdayjobs.com/wday/cxs/ulse/ulsecareers/sidebar/Data-Science---Engineering-Intern_JR1410-1",
+]
+
+headers = {
+    "User-Agent": "Mozilla/5.0",
+    "Accept": "application/json,text/plain,*/*",
+}
+# Probe each endpoint: print status and content type, then try the body as JSON.
+for url in endpoints:
+    r = requests.get(url, headers=headers, timeout=30)
+    print("\n===", url, "===")
+    print("Status:", r.status_code)
+    print("Content-Type:", r.headers.get("content-type"))
+    try:
+        data = r.json()
+    except ValueError:  # JSON decode errors from requests subclass ValueError
+        print("Not JSON. Text preview:\n", r.text[:600])
+        continue
+    print("JSON parsed ✅")
+    if isinstance(data, dict):
+        print("Top-level keys:", list(data)[:40])
+    else:  # sized values report their length; JSON scalars have none
+        print("Type:", type(data), "Len:", len(data) if hasattr(data, "__len__") else None)
--- a/scrape.py
+++ b/scrape.py
@@ -0,0 +1,68 @@
+from playwright.sync_api import sync_playwright
+from bs4 import BeautifulSoup
+import re
+
+URL = "https://ulse.wd5.myworkdayjobs.com/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1?jr_id=6979179b39f7f96cc6d173c0"
+# Load the posting in headless Chromium and keep the rendered HTML.
+with sync_playwright() as p:
+    browser = p.chromium.launch(headless=True)
+    page = browser.new_page()
+    page.goto(URL, wait_until="networkidle", timeout=60000)
+
+    # Wait a bit more for dynamic content (Workday can be heavy)
+    page.wait_for_timeout(2000)
+
+    html = page.content()
+    browser.close()
+# Parse with lxml and join the page's stripped text pieces with a space.
+soup = BeautifulSoup(html, "lxml")
+text = soup.get_text(" ", strip=True)
+
+print("Rendered text length:", len(text))
+print("Title:", soup.title.get_text(strip=True) if soup.title else None)
+# print("\nSample:\n", text[:1500])
+
+def extract_job_description(full_text):
+    """Return the job description sliced out of rendered page text.
+
+    The slice starts right after the first start marker that occurs (markers
+    are tried in listed order, ignoring case) and stops before the first end
+    marker that occurs after that point, or at the end of the text.
+
+    Matching runs on ``full_text`` itself instead of a lowercased copy:
+    ``str.lower()`` can change the length of some strings (e.g. "İ"), which
+    would shift every index taken from the copy.
+
+    Returns None when no start marker is present.
+    """
+    start_markers = ("job description", "about the role", "job summary")
+    end_markers = (
+        "equal opportunity",
+        "eeo",
+        "ul is an equal",
+        "application process",
+        "how to apply",
+        "privacy",
+        "cookies",
+    )
+
+    def find(marker, pos=0):
+        return re.compile(re.escape(marker), re.IGNORECASE).search(full_text, pos)
+
+    # Priority is list order, not position in the text.
+    start = next(filter(None, map(find, start_markers)), None)
+    if start is None:
+        return None
+    start_idx = start.end()
+
+    # Only look for end markers after the start marker.
+    ends = (find(marker, start_idx) for marker in end_markers)
+    end = next((m for m in ends if m), None)
+    end_idx = end.start() if end else len(full_text)
+    return full_text[start_idx:end_idx].strip()
+
+job_desc = extract_job_description(text)
+# extract_job_description returns None when no start marker is found.
+print("\n--- CLEAN JOB DESCRIPTION ---\n")
+print(job_desc if job_desc else "No job description found.")
+print("\nLength:", len(job_desc) if job_desc else 0)
--- a/scripts/scrape.py
+++ b/scripts/scrape.py
@@ -0,0 +1,30 @@
+import requests
+
+url = "https://en.wikipedia.org/w/api.php"
+
+headers = {
+    "User-Agent": "CSE881-MovieProject/1.0 (ishaa@msu.edu)"
+}
+# Ask for the plain-text extract of the "Interstellar" article.
+params = {
+    "action": "query",
+    "format": "json",
+    "prop": "extracts",
+    "titles": "Interstellar",
+    "explaintext": True,
+    "redirects": 1
+}
+# Bound the wait: requests has no default timeout.
+response = requests.get(url, headers=headers, params=params, timeout=30)
+
+print("Status:", response.status_code)
+print("Content-Type:", response.headers.get("content-type"))
+print("First 200 chars:\n", response.text[:200])
+
+data = response.json()
+pages = data["query"]["pages"]
+page = next(iter(pages.values()))
+
+print("\nTitle:", page["title"])
+# "extract" may be absent (e.g. the title does not exist); default to "".
+print("\nPreview:\n", page.get("extract", "")[:500])
--- a/try.py
+++ b/try.py
@@ -0,0 +1,31 @@
+from playwright.sync_api import sync_playwright
+
+URL = "https://ulse.wd5.myworkdayjobs.com/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1?jr_id=6979179b39f7f96cc6d173c0"
+
+# URLs of the responses accepted by on_response, in arrival order.
+hits = []
+
+def on_response(resp):
+    """Record the URL of an XHR/fetch response whose content type is JSON."""
+    kind = resp.request.resource_type
+    content_type = resp.headers.get("content-type", "")
+    if kind not in ("xhr", "fetch") or "application/json" not in content_type:
+        return
+    hits.append(resp.url)  # job data is usually fetched via XHR/fetch as JSON
+
+with sync_playwright() as p:
+    browser = p.chromium.launch(headless=True)
+    page = browser.new_page()
+    page.on("response", on_response)
+    # The listener above is attached before navigation begins.
+    page.goto(URL, wait_until="domcontentloaded", timeout=60000)
+
+    # Scroll a bit in case job details load on scroll
+    page.mouse.wheel(0, 2000)
+    page.wait_for_timeout(4000)
+
+    browser.close()
+# Report how many URLs were collected and print at most the first 50.
+print("XHR/FETCH JSON URLs found:", len(hits))
+for u in hits[:50]:
+    print(u)