Merge pull request #2 from IshaAtteri/isha
structure change
This commit was merged in pull request #2.
This commit is contained in:
68
scrape.py
68
scrape.py
@@ -1,68 +0,0 @@
|
|||||||
from playwright.sync_api import sync_playwright
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
import re
|
|
||||||
|
|
||||||
# Target posting; Workday renders job content client-side, so the page must be
# executed in a real browser before its text is visible.
URL = "https://ulse.wd5.myworkdayjobs.com/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1?jr_id=6979179b39f7f96cc6d173c0"


def _rendered_html(url):
    """Load *url* in headless Chromium and return the fully rendered HTML."""
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(url, wait_until="networkidle", timeout=60000)
        # Wait a bit more for dynamic content (Workday can be heavy)
        page.wait_for_timeout(2000)
        rendered = page.content()
        browser.close()
    return rendered


html = _rendered_html(URL)

# Flatten the rendered DOM into a single whitespace-normalized string.
soup = BeautifulSoup(html, "lxml")
text = soup.get_text(" ", strip=True)

print("Rendered text length:", len(text))
print("Title:", soup.title.get_text(strip=True) if soup.title else None)
# print("\nSample:\n", text[:1500])
|
|
||||||
|
|
||||||
# Phrase lists used to locate the job-description span inside the page text.
# Matching is case-insensitive; callers may override them for sites that use
# different section headings or boilerplate.
START_MARKERS = ("job description", "about the role", "job summary")
END_MARKERS = (
    "equal opportunity",
    "eeo",
    "ul is an equal",
    "application process",
    "how to apply",
    "privacy",
    "cookies",
)


def extract_job_description(full_text, start_markers=START_MARKERS, end_markers=END_MARKERS):
    """Slice the job-description section out of a page's flattened text.

    The description is taken to begin immediately after the first start
    marker found, and to end at the first end marker found after that
    point (or at the end of the text when no end marker matches).

    Args:
        full_text: Whole-page text, e.g. ``soup.get_text(" ", strip=True)``.
        start_markers: Case-insensitive phrases that precede the description.
        end_markers: Case-insensitive boilerplate phrases that follow it.

    Returns:
        The stripped description string, or ``None`` when *full_text* is
        empty/None or contains none of the start markers.
    """
    if not full_text:
        # Covers both "" and None (len/lower on None would raise).
        return None

    lt = full_text.lower()

    # Start of the description: just past the first marker that matches.
    start_idx = -1
    for marker in start_markers:
        i = lt.find(marker.lower())
        if i != -1:
            start_idx = i + len(marker)
            break
    if start_idx == -1:
        return None

    # End of the description: the first boilerplate marker appearing after
    # the start, falling back to the end of the text.
    end_idx = len(full_text)
    for marker in end_markers:
        i = lt.find(marker.lower(), start_idx)
        if i != -1:
            end_idx = i
            break

    return full_text[start_idx:end_idx].strip()
|
|
||||||
|
|
||||||
job_desc = extract_job_description(text)

print("\n--- CLEAN JOB DESCRIPTION ---\n")
print(job_desc if job_desc else "No job description found.")
# extract_job_description returns None when no start marker is found;
# the original len(job_desc) would raise TypeError in that case.
print("\nLength:", len(job_desc) if job_desc else 0)
|
|
||||||
31
try.py
31
try.py
@@ -1,31 +0,0 @@
|
|||||||
from playwright.sync_api import sync_playwright
|
|
||||||
|
|
||||||
# Same posting as scrape.py; here we watch the network instead of the DOM.
URL = "https://ulse.wd5.myworkdayjobs.com/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1?jr_id=6979179b39f7f96cc6d173c0"

# URLs of JSON data responses observed while the page loads.
hits = []


def on_response(resp):
    """Record *resp*'s URL when it looks like a JSON data call.

    Job data is usually fetched via XHR/fetch and returned as JSON, so
    only those responses are kept.
    """
    content_type = resp.headers.get("content-type", "")
    is_data_request = resp.request.resource_type in ("xhr", "fetch")
    if is_data_request and "application/json" in content_type:
        hits.append(resp.url)
|
|
||||||
|
|
||||||
# Drive the page load with the response hook attached so every network
# response passes through on_response while the posting renders.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.on("response", on_response)

    page.goto(URL, wait_until="domcontentloaded", timeout=60000)

    # Scroll a bit in case job details load on scroll
    page.mouse.wheel(0, 2000)
    page.wait_for_timeout(4000)

    browser.close()
|
|
||||||
|
|
||||||
print("XHR/FETCH JSON URLs found:", len(hits))
# Show at most the first 50 captured endpoints to keep output readable.
for endpoint in hits[:50]:
    print(endpoint)
|
|
||||||
Reference in New Issue
Block a user