# datamining_881/scrape.py

from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import re

URL = "https://ulse.wd5.myworkdayjobs.com/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1?jr_id=6979179b39f7f96cc6d173c0"

# Load the posting in headless Chromium and capture the rendered HTML
# into `html` for the parsing step below.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    try:
        page = browser.new_page()
        page.goto(URL, wait_until="networkidle", timeout=60000)

        # Wait a bit more for dynamic content (Workday can be heavy)
        page.wait_for_timeout(2000)

        html = page.content()
    finally:
        # Close the browser even when navigation times out or fails, so
        # cleanup does not depend on reaching the end of the happy path.
        browser.close()

# Parse the captured markup and flatten it into one space-separated string.
soup = BeautifulSoup(html, "lxml")
text = soup.get_text(" ", strip=True)

print("Rendered text length:", len(text))
if soup.title:
    print("Title:", soup.title.get_text(strip=True))
else:
    print("Title:", None)
# Debugging aid: uncomment to preview the start of the flattened text.
# print("\nSample:\n", text[:1500])

def _find_first_marker(text, markers, pos=0):
    """Return the match for the highest-priority marker present in *text*.

    Markers are tried in the order given; the first one that occurs at or
    after *pos* wins, regardless of where later markers occur. Matching is
    case-insensitive and runs on *text* itself, so the reported offsets are
    valid indices into *text*. Returns ``None`` when no marker occurs.
    """
    for marker in markers:
        found = re.compile(re.escape(marker), re.IGNORECASE).search(text, pos)
        if found is not None:
            return found
    return None


def extract_job_description(full_text):
    """Extract the job-description section from flattened page text.

    The section starts right after the first start marker that is present
    (markers are tried in priority order) and ends just before the first end
    marker, again in priority order, that occurs after the start. When no
    end marker follows, the section runs to the end of the text.

    Args:
        full_text: The page text as a single string.

    Returns:
        The section with surrounding whitespace stripped, or ``None`` when
        no start marker is found.
    """
    start_markers = (
        "job description",
        "about the role",
        "job summary",
    )

    end_markers = (
        "equal opportunity",
        "eeo",
        "ul is an equal",
        "application process",
        "how to apply",
        "privacy",
        "cookies",
    )

    # Search the original string rather than a lowercased copy: str.lower()
    # can change the length of the text (e.g. U+0130 becomes two code
    # points), which would shift every offset used for slicing below.
    start = _find_first_marker(full_text, start_markers)
    if start is None:
        return None

    end = _find_first_marker(full_text, end_markers, start.end())
    end_idx = end.start() if end is not None else len(full_text)

    return full_text[start.end():end_idx].strip()

# Run the extraction on the flattened page text and report the result.
job_desc = extract_job_description(text)

print("\n--- CLEAN JOB DESCRIPTION ---\n")
print(job_desc if job_desc else "No job description found.")
# extract_job_description returns None when no start marker is found;
# report a length of 0 in that case instead of raising TypeError.
print("\nLength:", len(job_desc) if job_desc else 0)