Isha #1
24
new_met.py
Normal file
24
new_met.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
import requests

# Workday serves job-posting data from its "cxs" JSON API rather than the
# HTML careers page; this endpoint returns the posting payload directly.
url = "https://ulse.wd5.myworkdayjobs.com/wday/cxs/ulse/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1"

# Alternate postings used while testing:
# url = 'https://motorolasolutions.wd5.myworkdayjobs.com/en-US/Careers/job/Schaumburg-IL/Machine-Learning-AI-Intern--Summer-2026-Internship-Program-_R59959?jr_id=69793e9952f3c27ec6459544'
# url = 'https://mcafee.wd1.myworkdayjobs.com/External/job/US-California-San-Jose/Data-Science-Intern_JR0032270?jr_id=6979499688e2b47213bd807f'

# A timeout stops the script from hanging forever if the host stalls;
# requests has no default timeout.
page = requests.get(url, timeout=30)

print(page.content)

# Exploratory code for walking the JSON payload, kept for reference:
# data = page.json()
# print(data.keys())
# job = data["jobPostingInfo"]
# print(job.keys())
# company = data["hiringOrganization"]
# print(company.keys())
# comp = company["name"]
# print("Company:", comp)
|
||||||
27
part2.py
Normal file
27
part2.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
import requests

# The "job" endpoint returns the posting body; the "sidebar" endpoint
# returns the metadata Workday shows beside it.
endpoints = [
    "https://ulse.wd5.myworkdayjobs.com/wday/cxs/ulse/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1",
    "https://ulse.wd5.myworkdayjobs.com/wday/cxs/ulse/ulsecareers/sidebar/Data-Science---Engineering-Intern_JR1410-1",
]

# Workday tends to reject requests without a browser-like User-Agent.
headers = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "application/json,text/plain,*/*",
}

for url in endpoints:
    r = requests.get(url, headers=headers, timeout=30)
    print("\n===", url, "===")
    print("Status:", r.status_code)
    print("Content-Type:", r.headers.get("content-type"))

    # Try JSON. Response.json() raises a ValueError subclass on a non-JSON
    # body, so catch that specifically — a bare `except Exception` would
    # also hide real bugs in the success-path code below.
    try:
        data = r.json()
    except ValueError:
        print("Not JSON. Text preview:\n", r.text[:600])
    else:
        print("JSON parsed ✅")
        if isinstance(data, dict):
            print("Top-level keys:", list(data.keys())[:40])
        else:
            print("Type:", type(data), "Len:", len(data))
|
||||||
68
scrape.py
Normal file
68
scrape.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import re

URL = "https://ulse.wd5.myworkdayjobs.com/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1?jr_id=6979179b39f7f96cc6d173c0"

# Render the page in a headless browser so Workday's client-side app can
# populate the job details, then snapshot the resulting DOM as HTML.
with sync_playwright() as pw:
    chromium = pw.chromium.launch(headless=True)
    tab = chromium.new_page()
    tab.goto(URL, wait_until="networkidle", timeout=60000)
    # Give the heavy Workday front-end a little extra time to settle.
    tab.wait_for_timeout(2000)
    html = tab.content()
    chromium.close()

# Flatten the rendered DOM into one whitespace-normalized text blob for
# the marker-based extraction below.
soup = BeautifulSoup(html, "lxml")
text = soup.get_text(" ", strip=True)

print("Rendered text length:", len(text))
print("Title:", soup.title.get_text(strip=True) if soup.title else None)
|
||||||
|
|
||||||
|
def extract_job_description(full_text):
    """Slice the job-description portion out of a page's flattened text.

    Scans case-insensitively for a start marker ("job description",
    "about the role", "job summary") and cuts the text off at the earliest
    end-of-description marker (EEO/privacy/cookie boilerplate).

    Returns the stripped description string, or None when no start marker
    is present.
    """
    lt = full_text.lower()

    # Checked in priority order: the first marker present in the text wins.
    start_markers = [
        "job description",
        "about the role",
        "job summary",
    ]

    end_markers = [
        "equal opportunity",
        "eeo",
        "ul is an equal",
        "application process",
        "how to apply",
        "privacy",
        "cookies",
    ]

    start_idx = -1
    for m in start_markers:
        i = lt.find(m)
        if i != -1:
            start_idx = i + len(m)
            break

    if start_idx == -1:
        return None

    # Cut at the EARLIEST end marker after the start. Breaking on the first
    # *list* entry found (the previous behavior) could pick a marker that
    # occurs later in the text and leave earlier boilerplate (e.g. a
    # privacy notice) inside the description.
    found = [j for m in end_markers if (j := lt.find(m, start_idx)) != -1]
    end_idx = min(found) if found else len(full_text)

    return full_text[start_idx:end_idx].strip()
|
||||||
|
|
||||||
|
job_desc = extract_job_description(text)

print("\n--- CLEAN JOB DESCRIPTION ---\n")
print(job_desc if job_desc else "No job description found.")
# Guard the length report: extract_job_description returns None when no
# start marker was found, and len(None) would raise a TypeError here.
print("\nLength:", len(job_desc) if job_desc else 0)
|
||||||
30
scripts/scrape.py
Normal file
30
scripts/scrape.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
import requests

url = "https://en.wikipedia.org/w/api.php"

# Wikimedia API etiquette asks for a descriptive User-Agent with contact info.
headers = {
    "User-Agent": "CSE881-MovieProject/1.0 (ishaa@msu.edu)"
}

# Request the plain-text extract of the article, following redirects.
params = {
    "action": "query",
    "format": "json",
    "prop": "extracts",
    "titles": "Interstellar",
    "explaintext": True,
    "redirects": 1
}

# A timeout stops the script from hanging forever if the API stalls.
response = requests.get(url, headers=headers, params=params, timeout=30)

print("Status:", response.status_code)
print("Content-Type:", response.headers.get("content-type"))
print("First 200 chars:\n", response.text[:200])

# Fail loudly on an HTTP error instead of crashing later on a body that
# does not have the expected JSON shape.
response.raise_for_status()
data = response.json()

# "pages" is keyed by page id; with a single title there is exactly one entry.
pages = data["query"]["pages"]
page = next(iter(pages.values()))

print("\nTitle:", page["title"])
print("\nPreview:\n", page["extract"][:500])
|
||||||
31
try.py
Normal file
31
try.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
from playwright.sync_api import sync_playwright

URL = "https://ulse.wd5.myworkdayjobs.com/ulsecareers/job/Evanston-IL/Data-Science---Engineering-Intern_JR1410-1?jr_id=6979179b39f7f96cc6d173c0"

# URLs of JSON responses fetched by the page's own scripts.
hits = []


def on_response(resp):
    """Record the URL of every XHR/fetch response that carried JSON."""
    # Job data is usually fetched via XHR/fetch and returns JSON.
    is_api_call = resp.request.resource_type in ("xhr", "fetch")
    is_json = "application/json" in resp.headers.get("content-type", "")
    if is_api_call and is_json:
        hits.append(resp.url)


with sync_playwright() as pw:
    chromium = pw.chromium.launch(headless=True)
    tab = chromium.new_page()
    tab.on("response", on_response)

    tab.goto(URL, wait_until="domcontentloaded", timeout=60000)

    # Scroll a bit in case job details load on scroll.
    tab.mouse.wheel(0, 2000)
    tab.wait_for_timeout(4000)

    chromium.close()

print("XHR/FETCH JSON URLs found:", len(hits))
for u in hits[:50]:
    print(u)
|
||||||
Reference in New Issue
Block a user