diff --git a/Dockerfile b/Dockerfile
index 048ca07..1a2f714 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,13 +1,15 @@
 # Stage 1: Build Stage
-FROM python:3.9-slim AS builder
+FROM python:3.9.18-slim AS builder
 
-# Set environment variables to prevent .pyc files and enable unbuffered logging
+# Set environment variables
 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1
 
-# Install build dependencies
+# Install build dependencies including PyMuPDF requirements
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
+    libmupdf-dev \
+    python3-dev \
     && rm -rf /var/lib/apt/lists/*
 
 # Set working directory
@@ -15,17 +17,28 @@ WORKDIR /jobflow-api
 
 # Copy dependency file and install dependencies
 COPY requirements.txt .
-RUN pip install --upgrade pip && pip install -r requirements.txt
+RUN pip install --upgrade pip && \
+    pip install -r requirements.txt && \
+    pip install gunicorn  # Add explicit gunicorn installation
 
-# Generate Prisma client
+# Generate Prisma client (Fixed command)
 COPY db/ ./db/
-RUN pip install prisma && python -c "from prisma import generate_client; generate_client()"
+COPY prisma/ ./prisma/
+RUN pip install prisma && \
+    prisma generate --schema=prisma/schema.prisma
 
 # Copy the rest of the application code
 COPY . .
 
 # Stage 2: Final Image
-FROM python:3.9-slim
+FROM python:3.9.18-slim
+
+# Install runtime dependencies for PyMuPDF
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    curl \
+    libmupdf-dev \
+    && rm -rf /var/lib/apt/lists/*
 
 # Set environment variables
 ENV PYTHONDONTWRITEBYTECODE=1 \
@@ -45,13 +58,10 @@ WORKDIR /jobflow-api
 
 # Copy only the installed packages and application from the builder stage
 COPY --from=builder /usr/local/lib/python3.9/site-packages /usr/local/lib/python3.9/site-packages
 COPY --from=builder /jobflow-api /jobflow-api
+# Copy gunicorn binary specifically
 COPY --from=builder /usr/local/bin/gunicorn /usr/local/bin/
 
 # Expose the port your app runs on
 EXPOSE 5001
-
-# Add a health check endpoint
-RUN echo 'from flask import Blueprint\nhealth_blueprint = Blueprint("health", __name__)\n@health_blueprint.route("/health")\ndef health_check():\n    return {"status": "healthy"}, 200' > /jobflow-api/controllers/health.py
-
 # Use Gunicorn to run the app
 CMD ["gunicorn", "--workers", "4", "--timeout", "300", "--bind", "0.0.0.0:5001", "app:app"]
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 34c54c1..3890ec9 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,3 @@
-version: '3.8'
-
 services:
   web:
     build:
@@ -11,9 +9,14 @@ services:
       - DATABASE_URL=${DATABASE_URL}
       - SCRAPER_API=${SCRAPER_API}
       - JWT_SECRET=${JWT_SECRET}
+      - GOOGLE_API_KEY=${GOOGLE_API_KEY}
+      - JOBFLOW_AWS_REGION=${JOBFLOW_AWS_REGION}
+      - JOBFLOW_AWS_ACCESS_KEY=${JOBFLOW_AWS_ACCESS_KEY}
+      - JOBFLOW_AWS_SECRET_KEY=${JOBFLOW_AWS_SECRET_KEY}
+      - AWS_BUCKET_NAME=${AWS_BUCKET_NAME}
     restart: always
     healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:5001/health"]
+      test: ["CMD", "curl", "-f", "http://localhost:5001/api"]
       interval: 30s
       timeout: 10s
       retries: 3
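Note on the healthcheck change above: the generated `/health` blueprint is removed from the Dockerfile, and the compose healthcheck now curls `/api` instead. This only works if the Flask app already answers on `/api`; if it does not, a minimal sketch of such a ping route (route name and `app` object are assumptions, not confirmed by this diff) could look like:

```python
# Hypothetical sketch: a lightweight ping route for the compose healthcheck.
# Assumes `app` is the Flask instance exposed to gunicorn as app:app.
from flask import Flask

app = Flask(__name__)

@app.route("/api")
def api_ping():
    # curl -f succeeds on any 2xx response, so a tiny JSON body is enough.
    return {"status": "ok"}, 200
```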
diff --git a/function/__init__.py b/function/__init__.py
index af03b0d..531f67c 100644
--- a/function/__init__.py
+++ b/function/__init__.py
@@ -1,4 +1,4 @@
 # from .insert_job import insert_job
-# from .crawler.crawler import scrapejobsdata, fetch_job_details
+# from .crawler.crawler import scrapejobsdata, fetch_job_details_linkedin
 
-# __all__ = ['scrapejobsdata', 'fetch_job_details']
\ No newline at end of file
+# __all__ = ['scrapejobsdata', 'fetch_job_details_linkedin']
\ No newline at end of file
diff --git a/function/aiHelper.py b/function/aiHelper.py
index 4b89808..256b4c5 100644
--- a/function/aiHelper.py
+++ b/function/aiHelper.py
@@ -14,8 +14,10 @@
 - job_salary (string or null): Full salary text (e.g., 12L - 15L annually) or null if not found.
 - experience_min (int or null): Minimum years of experience (e.g., 2) or null if not found.
 - experience_max (int or null): Maximum years of experience (e.g., 5) or null if not found.
-- experience (string or null): Experience level (e.g., "mid-level", "Trainee", "new-grad", "fresher", "Experienced") understand what it could be based on the min & max experience and based on JD & role. If its hard or less precise return null.
+- experience (string or null): Experience level (e.g., "Internship", "Entry-level", "Mid-level", "Experienced"); infer it from experience_min, experience_max, the JD, and the role. If it is hard to determine or imprecise, return null.
 - skills_required (list of strings or null): List of skills (e.g., ["Python", "SQL"]) or null if not found.
+- job_type (string or null): Type of job (e.g., "Full-time", "Part-time", "Contract") or null if not found.
+- end_date (string or null): End date of the job (e.g., "2024-12-31") or null if not found.
 Return the result as a JSON object. If a field cannot be determined, use null. Be precise and avoid guessing.
 """
@@ -50,4 +52,6 @@ def extract_job_details_with_AI(job_description):
         "experience_max": None,
         "experience": None,
         "skills_required": None,
+        "job_type": None,
+        "end_date": None,
     }
\ No newline at end of file
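The extractor prompt and its fallback dict now carry two extra keys, `job_type` and `end_date`. The new scraper code below indexes the result directly (e.g. `jd_extracted['job_type']`), which raises `KeyError` if the model response omits a key. A small defensive sketch, reusing the existing `extract_job_details_with_AI` function and otherwise making no assumptions about its internals:

```python
# Sketch: read the AI-extracted fields defensively so a missing key
# degrades to None instead of raising KeyError downstream.
jd_extracted = extract_job_details_with_AI(job_description) or {}

job_type = jd_extracted.get("job_type")
end_date = jd_extracted.get("end_date")
skills = jd_extracted.get("skills_required") or []
```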
diff --git a/function/crawler/__init__.py b/function/crawler/__init__.py
index 5a001f3..0ec0aa2 100644
--- a/function/crawler/__init__.py
+++ b/function/crawler/__init__.py
@@ -1,5 +1,5 @@
 # from .crawler import scrapejobsdata
-# from .crawler import fetch_job_details
+# from .crawler import fetch_job_details_linkedin
 # from .crawler import createFile
 
-# __all__ = ['scrapejobsdata', 'fetch_job_details', 'createFile']
\ No newline at end of file
+# __all__ = ['scrapejobsdata', 'fetch_job_details_linkedin', 'createFile']
\ No newline at end of file
diff --git a/function/crawler/crawler.py b/function/crawler/crawler.py
index fdd6348..3b7c51c 100644
--- a/function/crawler/crawler.py
+++ b/function/crawler/crawler.py
@@ -28,18 +28,26 @@
 async def scrape_workday_jobs():
     await scrape_workday()
 
+async def scrape_linkedin_jobs():
+    await scrape_linkedin()
+
+async def scrape_glassdoor_jobs():
+    await scrape_glassdoor()
+
+async def scrape_simplyhired_jobs():
+    await scrape_simplyhired()
+
 async def scrapejobsdata(searchKeyword):
     searchKeyword = urllib.parse.quote(searchKeyword)  # Encodes spaces as %20
     jobPortals = {
-        "glassdoor": f"https://www.glassdoor.co.in/Job/india-{searchKeyword}-jobs-SRCH_IL.0,5_IN115_KO6,27.htm?sc.keyword={searchKeyword}&sortBy=date_desc",
-        "linkedin": f"https://www.linkedin.com/jobs/search/?f_TPR=r2000&geoId=102713980&sortBy=DD",
-        "simplyhired": f"https://www.simplyhired.co.in/search?q={searchKeyword}&l=india&s=d&jt=CF3CP&t=1&mip=555000",  # Use only important keywords & scrape
-        "indeed": f"https://in.indeed.com/jobs?q={searchKeyword}",
+        # "glassdoor": f"https://www.glassdoor.co.in/Job/india-{searchKeyword}-jobs-SRCH_IL.0,5_IN115_KO6,27.htm?sc.keyword={searchKeyword}&sortBy=date_desc",
+        # "simplyhired": f"https://www.simplyhired.co.in/search?q={searchKeyword}&l=india&s=d&jt=CF3CP&t=1&mip=555000",  # Use only important keywords & scrape
+        # "indeed": f"https://in.indeed.com/jobs?q={searchKeyword}",
         # "ycombinator": f"https://www.workatastartup.com/companies?query={searchKeyword}&sortBy=keyword",  # Need signin
         # "internshala": f"https://internshala.com/jobs/salary-7/",  # Not in sorted order or filter by date.
-        "upwork": f"https://www.upwork.com/nx/search/jobs/?q={searchKeyword}",
-        "freelancer": f"https://www.freelancer.com/search/projects?q={searchKeyword}",
+        # "upwork": f"https://www.upwork.com/nx/search/jobs/?q={searchKeyword}",
+        # "freelancer": f"https://www.freelancer.com/search/projects?q={searchKeyword}",
         # "naukri": f"https://www.naukri.com/jobs-in-india?jobAge=1",  # Use all filters to limit quality jobs
         # "foundit": f"https://www.foundit.in/srp/results?query={searchKeyword}",  # Proxy issue
     }
@@ -63,10 +71,10 @@ async def scrapejobsdata(searchKeyword):
 
         if portal == 'linkedin':
             print(portal)
-            await scrape_linkedin(soup)
+            # await scrape_linkedin(soup)
 
-        elif portal == 'glassdoor':
-            await scrape_glassdoor(soup)
+        # elif portal == 'glassdoor':
+        #     await scrape_glassdoor(soup)
 
         elif portal == 'indeed':
             await scrape_indeed(soup)
diff --git a/function/crawler/job_portals/foundit.py b/function/crawler/job_portals/foundit.py
index 010a004..530db49 100644
--- a/function/crawler/job_portals/foundit.py
+++ b/function/crawler/job_portals/foundit.py
@@ -1,4 +1,4 @@
-from function.utils import createFile, fetch_job_details
+from function.utils import createFile, fetch_job_details_linkedin
 from function.insert_job import insert_job
 
 async def scrape_foundit(soup):
diff --git a/function/crawler/job_portals/freelancer.py b/function/crawler/job_portals/freelancer.py
index 7ff00c6..81510e0 100644
--- a/function/crawler/job_portals/freelancer.py
+++ b/function/crawler/job_portals/freelancer.py
@@ -1,4 +1,4 @@
-from function.utils import createFile, fetch_job_details, extract_salary
+from function.utils import createFile, fetch_job_details_linkedin, extract_salary
 from function.insert_job import insert_job
 from function.utils import extract_salary
diff --git a/function/crawler/job_portals/glassdoor.py b/function/crawler/job_portals/glassdoor.py
index 0f1f49e..da11d01 100644
--- a/function/crawler/job_portals/glassdoor.py
+++ b/function/crawler/job_portals/glassdoor.py
@@ -1,19 +1,64 @@
-from function.utils import createFile, fetch_job_details, extract_salary
+from function.utils import createFile, fetch_job_details_linkedin, extract_salary
 from function.insert_job import insert_job
 from function.utils import extract_salary
+from dotenv import load_dotenv
+import os
+import logging
+import requests
+from bs4 import BeautifulSoup
+import urllib
+import time
+from db.prisma import db
+from function.aiHelper import extract_job_details_with_AI
 
-async def scrape_glassdoor(soup):
+
+logging.basicConfig(filename='log.txt', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+load_dotenv()
+
+scraperapi_key = os.getenv('SCRAPER_API')
+
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
+}
+
+async def scrape_glassdoor_jobpage(soup):
     portal = 'glassdoor'
-    print("inside glassdoor scrape")
+    print("inside glassdoor scrape \n")
     job_list = soup.find('ul', class_='JobsList_jobsList__lqjTr')
+    jobs_count = 0
     if job_list:
         jobs = job_list.find_all('li')
         with open(f"{portal}_jobs.txt", "w", encoding="utf-8") as file:
             for job in jobs:
                 title_element = job.find('a', class_='JobCard_jobTitle__GLyJ1')
                 title = title_element.text.strip() if title_element else None
+                if title is None:
+                    continue
                 company_name_element = job.find('span', class_='EmployerProfile_compactEmployerName__9MGcV')
                 company_name = company_name_element.text.strip() if company_name_element else None
+
+                if company_name:
+                    normalized_company_name = company_name.strip().lower()
+                    company = await db.company.find_unique(where={'company_name': normalized_company_name})
+
+                    if company and title:
+                        existing_job = await db.job.find_first(where={
+                            "title": title,
+                            "companyId": company.id,
+                            "status": "active"
+                        })
+
+                        if existing_job:
+                            logger.info(f"Job '{title}' already exists for company '{company_name}'")
+                            continue
+
+                if not company_name:
+                    company_name = "Unknown Company"
+
+                logger.info(f"Title: {title}, Company Name: {company_name}")
+                jobs_count += 1
                 job_link = title_element['href'] if title_element else None
                 job_location_element = job.find('div', class_='JobCard_location__Ds1fM')
                 job_location = job_location_element.text.strip() if job_location_element else None
@@ -27,19 +72,20 @@ async def scrape_glassdoor(soup):
 
                 company_logo = logo_url
 
-                job_description_section = job.find('div', class_='JobCard_jobDescriptionSnippet__l1tnl')
-                job_description = None
-                skills = []
-                if job_description_section:
-                    description_divs = job_description_section.find_all('div')
-                    if len(description_divs) > 0:
-                        job_description = description_divs[0].text.strip()
-                    if len(description_divs) > 1:
-                        skills_section = description_divs[1]
-                        if skills_section:
-                            skills_text = skills_section.text.strip().replace('Skills:', '').strip()
-                            skills = [skill.strip() for skill in skills_text.split(',')]
+                time.sleep(2)
+                job_description = fetch_job_details_glassdoor(job_link)
+                job_description = job_description.get_text(strip=True)
+
+                jd_extracted = extract_job_details_with_AI(job_description)
+                skills = jd_extracted['skills_required']
+                experience_min = jd_extracted['experience_min']
+                experience_max = jd_extracted['experience_max']
+                experience = jd_extracted['experience']
+                end_date = jd_extracted['end_date']
+                job_type = jd_extracted['job_type']
+
 
                 salary_min = None
                 salary_max = None
@@ -57,12 +103,88 @@ async def scrape_glassdoor(soup):
                     "salary_min": salary_min,
                     "salary_max": salary_max,
                     "skills_required": skills,
-                    "experience_level": None,
+                    "experience_level": experience,
+                    "experience_min": experience_min,
+                    "experience_max": experience_max,
+                    "end_date": end_date,
+                    "job_type": job_type,
                     "job_description": job_description,
                     "source": portal
                 }
                 try:
                     await insert_job(job_info)
-                    createFile(file, title, company_name, job_link, job_location, job_description, skills, None, job_salary, portal, None)
+                    # createFile(file, title, company_name, job_link, job_location, job_description, skills, None, job_salary, portal, None)
                 except Exception as e:
                     print(f"Error inserting job for {job_info.get('title', 'unknown')}: {e}")
+    return jobs_count
+
+def fetch_job_details_glassdoor(job_url):
+    try:
+        proxy_url = f"http://api.scraperapi.com?api_key={scraperapi_key}&url={job_url}"
+        response = requests.get(proxy_url, headers=headers)
+        response.raise_for_status()
+        job_page = BeautifulSoup(response.text, 'html.parser')
+
+        job_description = job_page.find('div', class_=lambda x: x and 'JobDetails_jobDescription__uW_fK' in x)
+        return job_description
+
+    except requests.exceptions.RequestException as e:
+        print(f"Failed to fetch job details from {job_url}: {e}")
+        return None
+
+
+
+searchKeywords = [
+    "software developer",
+    "data scientist",
+    "full stack developer",
+    "python developer",
+    "project manager",
+    "machine learning engineer",
+    "data analyst",
+    "cloud engineer",
+    "frontend developer",
+    "backend developer",
+    "product manager",
+    "devops engineer",
+    "HR manager",
+    "digital marketing",
+    "business analyst",
+    "sales manager",
+    "AI research scientist",
+    "web developer",
+    "graphic designer",
+    "react developer"
+]
+
+async def scrape_glassdoor():
+    total_jobs = 0
+
+    for keyword in searchKeywords:
+        keyword_hyphenated = keyword.replace(" ", "-")
+        ko_end = 6 + len(keyword_hyphenated) + 1
+        base_url = "https://www.glassdoor.co.in/Job/india-{keyword}-jobs-SRCH_IL.0,5_IN115_KO6,{ko_end}.htm"
+        seniority_types = ["entrylevel", "internship"]
+        currentJobs = 0
+        for seniority in seniority_types:
+            params = {
+                "maxSalary": 6000000,
+                "minSalary": 10000,
+                "fromAge": 1,
+                "sortBy": "date_desc",
+                "seniorityType": seniority
+            }
+            url = base_url.format(keyword=keyword_hyphenated, ko_end=ko_end)
+            url += "?" + "&".join(f"{k}={v}" for k, v in params.items())
+
+            proxy_url = f"http://api.scraperapi.com?api_key={scraperapi_key}&url={url}"
+            response = requests.get(proxy_url, headers=headers)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, 'html.parser')
+            print(f"URL for {seniority}: {url}")
+            time.sleep(2)
+            currentJobs += await scrape_glassdoor_jobpage(soup)
+        total_jobs += currentJobs
+        print(f"Jobs fetched: {currentJobs} for {keyword} jobs")
+
+    print(f"Total jobs fetched: {total_jobs} on glassdoor")
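One caveat in the Glassdoor path above: `fetch_job_details_glassdoor` returns `None` on a failed request or when the description div is missing, but the calling loop immediately calls `.get_text(strip=True)` on the result. A hedged guard, reusing the names from the diff, might look like:

```python
# Sketch: skip the listing when the detail fetch fails,
# instead of crashing on None.get_text().
description_div = fetch_job_details_glassdoor(job_link)
if description_div is None:
    logger.info(f"Skipping '{title}': could not fetch job description")
    continue
job_description = description_div.get_text(strip=True)
```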
diff --git a/function/crawler/job_portals/indeed.py b/function/crawler/job_portals/indeed.py
index e15f0ac..8bd8609 100644
--- a/function/crawler/job_portals/indeed.py
+++ b/function/crawler/job_portals/indeed.py
@@ -1,4 +1,4 @@
-from function.utils import createFile, fetch_job_details
+from function.utils import createFile, fetch_job_details_linkedin
 from function.insert_job import insert_job
 
 async def scrape_indeed(soup):
     portal = 'indeed'
diff --git a/function/crawler/job_portals/internshala.py b/function/crawler/job_portals/internshala.py
index fd94224..cad8942 100644
--- a/function/crawler/job_portals/internshala.py
+++ b/function/crawler/job_portals/internshala.py
@@ -1,5 +1,5 @@
-from function.utils import createFile, fetch_job_details
+from function.utils import createFile, fetch_job_details_linkedin
 from function.insert_job import insert_job
 
 async def scrape_internshala(soup):
     portal = 'internshala'
diff --git a/function/crawler/job_portals/linkedin.py b/function/crawler/job_portals/linkedin.py
index 06366cc..1044d8f 100644
--- a/function/crawler/job_portals/linkedin.py
+++ b/function/crawler/job_portals/linkedin.py
@@ -1,52 +1,172 @@
-from function.utils import createFile, fetch_job_details, extract_salary
+from function.utils import createFile, fetch_job_details_linkedin, extract_salary
 from function.insert_job import insert_job
 from function.utils import extract_salary
+import re
+from function.aiHelper import extract_job_details_with_AI
+import requests
+from bs4 import BeautifulSoup
+import urllib
+import time
+import logging
+
+logging.basicConfig(level=logging.INFO)
+
+logger = logging.getLogger(__name__)
+
+
+def extract_job_id(job_link):
+    match = re.search(r'-(\d+)\?', job_link)
+    return match.group(1) if match else None
+
+def fetch_jobs_from_api(base_url, headers, start, count):
+    """Fetch jobs from LinkedIn API with pagination."""
+    params = {
+        'f_TPR': 'r3000',
+        'geoId': '102713980',
+        'sortBy': 'DD',
+        'f_E': '1,2,3',
+        'start': start,
+    }
+
+    finalUrl = base_url + "?" + urllib.parse.urlencode(params)
+    response = requests.get(base_url, headers=headers, params=params)
+    if response.status_code == 200:
+        return BeautifulSoup(response.text, 'html.parser')
+    else:
+        print(f"Failed to fetch jobs at start={start}: {response.status_code}, error: {response}")
+        return None
+
+async def scrape_linkedin_jobs(soup, portal='linkedin', portal_logo='https://www.linkedin.com/favicon.ico'):
+    """Process job listings from BeautifulSoup object."""
+    jobs = soup.find_all('li')
+    job_count = 0
+
+    for job in jobs:
+        title = job.find('h3', class_='base-search-card__title')
+        company_name = job.find('h4', class_='base-search-card__subtitle')
+        if title and company_name:
+            job_link = job.find('a', class_='base-card__full-link')
+            job_link = job_link['href'].strip() if job_link else None
+            job_id = extract_job_id(job_link)
+
+
+            job_location = job.find('span', class_='job-search-card__location')
+            if job_link:
+                job_details = fetch_job_details_linkedin(job_link)
+                if job_details:
+                    job_salary, experience_level, job_type, job_description, company_legit_check = job_details
+                    if company_legit_check is None:
+                        logger.info(f"Skipping this job: {title.text.strip()}, Company: {company_name.text.strip()}")
+                        continue
+                else:
+                    logger.info(f"No job details found for: {title.text.strip()}, Company: {company_name.text.strip()}")
+                    continue
+            else:
+                logger.info(f"Skipping job with no link: {title.text.strip()}, Company: {company_name.text.strip()}")
+                continue
+            job_count += 1
+
+            jd_extracted = extract_job_details_with_AI(job_description)
+            company_logo_element = job.find('div', class_='search-entity-media')
+            company_logo = None
+            if company_logo_element:
+                img_tag = company_logo_element.find('img')
+                if img_tag:
+                    company_logo = img_tag['data-delayed-url']
+
+            salary_min = None
+            salary_max = None
+            end_date = None
+
+            if job_salary:
+                salary_min, salary_max = extract_salary(job_salary)
+
+            if jd_extracted:
+                if salary_min is None:
+                    salary_min = jd_extracted.get("salary_min")
+                if salary_max is None:
+                    salary_max = jd_extracted.get("salary_max")
+                if job_salary is None:
+                    job_salary = jd_extracted.get("job_salary")
+                if jd_extracted.get("end_date"):
+                    end_date = jd_extracted.get("end_date")
+
+            if title and company_name and job_link and job_location:
+                job_info = {
+                    "title": title.text.strip(),
+                    "company_name": company_name.text.strip(),
+                    "company_logo": company_logo,
+                    "job_link": job_link,
+                    "job_id": job_id,
+                    "job_location": job_location.text.strip(),
+                    "job_type": job_type,
+                    "job_salary": job_salary,
+                    "salary_min": salary_min,
+                    "salary_max": salary_max,
+                    "experience_min": jd_extracted.get("experience_min"),
+                    "experience_max": jd_extracted.get("experience_max"),
+                    "experience": experience_level,
+                    "skills_required": jd_extracted.get("skills_required"),
+                    "job_description": job_description,
+                    "end_date": end_date,
+                    "source": portal,
+                    "source_logo": portal_logo
+                }
+                try:
+                    await insert_job(job_info)
+                    # createFile(file, title.text.strip(), company_name.text.strip(), job_link, job_location.text.strip(), None, None, experience_level, job_salary, portal, job_type)
+                except Exception as e:
+                    print(f"Error inserting job for {job_info.get('title', 'unknown')}: {e}")
+    return job_count
+
+async def scrape_linkedin():
+    base_url = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search"
+    page_url = "https://www.linkedin.com/jobs/search/?keywords=&location=India&geoId=102713980&f_TPR=r3000&f_E=1"
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        # 'Cookie': 'YOUR_LINKEDIN_COOKIE_HERE'
+    }
+
+    start = 25
+    count = 25
+    total_jobs = 0
+    requestCount = 0
+
+    print("Starting LinkedIn job scraping...")
+
+    # Initial Scraping
+    response = requests.get(page_url, headers=headers)
+    response.raise_for_status()
+    soup = BeautifulSoup(response.text, 'html.parser')
+    total_jobs += await scrape_linkedin_jobs(soup)
+    requestCount += 1
+
+
+    print(f"Fetched {total_jobs} jobs, total: {total_jobs}")
+    while True:
+        soup = fetch_jobs_from_api(base_url, headers, start, count)
+        if not soup:
+            break
+
+        jobs_fetched = await scrape_linkedin_jobs(soup)
+        total_jobs += jobs_fetched
+
+        print(f"Fetched {jobs_fetched} jobs, total: {total_jobs}")
+        if jobs_fetched == 0:
+            break
+
+        start += count
+        requestCount += 1
+        if requestCount % 18 == 0:
+            print(f"Fetched {jobs_fetched} jobs, total: {total_jobs}", "Pausing before the next batch...")
+            time.sleep(65)  # Delay to avoid rate limiting
+
+    print(f"Scraping complete. Total jobs fetched: {total_jobs}")
+
+
+
-async def scrape_linkedin(soup):
-    portal = 'linkedin'
-    job_list = soup.find('ul', class_='jobs-search__results-list')
-    if job_list:
-        jobs = job_list.find_all('li')
-        with open(f"{portal}_jobs.txt", "w", encoding="utf-8") as file:
-            for job in jobs:
-                title = job.find('h3', class_='base-search-card__title')
-                company_name = job.find('h4', class_='base-search-card__subtitle')
-                job_link = job.find('a', class_='base-card__full-link')
-                job_location = job.find('span', class_='job-search-card__location')
-                job_salary, experience_level, job_type = fetch_job_details(job_link['href'].strip()) if job_link else None
-                company_logo_element = job.find('div', class_='search-entity-media')
-                company_logo=None
-                if company_logo_element:
-                    img_tag= company_logo_element.find('img')
-                    if img_tag:
-                        company_logo = img_tag['data-delayed-url']
-
-                salary_min = None
-                salary_max = None
-
-                if job_salary:
-                    salary_min, salary_max = extract_salary(job_salary)
-
-                if title and company_name and job_link and job_location:
-                    job_info = {
-                        "title": title.text.strip(),
-                        "company_name": company_name.text.strip(),
-                        "company_logo": company_logo,
-                        "job_link": job_link['href'].strip(),
-                        "job_location": job_location.text.strip(),
-                        "job_salary": job_salary,
-                        "salary_min": salary_min,
-                        "salary_max": salary_max,
-                        "experience_level": experience_level,
-                        "job_type": job_type,
-                        "source": portal,
-                    }
-                    try:
-                        await insert_job(job_info)
-                        createFile(file, title.text.strip(), company_name.text.strip(), job_link['href'].strip(), job_location.text.strip(), None, None, experience_level, job_salary, portal, job_type)
-                    except Exception as e:
-                        print(f"Error inserting job for {job_info.get('title', 'unknown')}: {e}")
-
 async def scrape_linkedin_jobpage(soup, job_link):
     portal = 'linkedin'
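`extract_job_id` above only matches when the numeric ID sits between a hyphen and a `?` in the guest job URL; links without a query string return `None`. A small usage sketch (the URLs are illustrative, not real listings):

```python
# Illustrative URLs only; real LinkedIn links vary.
print(extract_job_id("https://in.linkedin.com/jobs/view/data-analyst-4012345678?position=1"))
# -> "4012345678"
print(extract_job_id("https://in.linkedin.com/jobs/view/data-analyst-4012345678"))
# -> None (no "?" after the ID)
```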
diff --git a/function/crawler/job_portals/simplyhired.py b/function/crawler/job_portals/simplyhired.py
index 985e64e..8cf8fc8 100644
--- a/function/crawler/job_portals/simplyhired.py
+++ b/function/crawler/job_portals/simplyhired.py
@@ -1,43 +1,231 @@
-from function.utils import createFile, fetch_job_details, extract_salary
+from function.utils import createFile
 from function.insert_job import insert_job
 from function.utils import extract_salary
+from dotenv import load_dotenv
+import os
+import logging
+import requests
+from bs4 import BeautifulSoup
+import urllib
+import time
+from db.prisma import db
+from function.aiHelper import extract_job_details_with_AI
 
-async def scrape_simplyhired(soup):
-    portal = 'simplyhired'
+from datetime import timedelta, datetime
+
+
+logging.basicConfig(filename='log.txt', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+load_dotenv()
+scraperapi_key = os.getenv('SCRAPER_API')
+
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
+}
+
+async def scrape_simplyhired_jobpage(soup):
+    portal = 'simplyhired'
+    print("inside simplyhired scrape \n")
+    jobs_count = 0
     job_list = soup.find('ul', id='job-list')
     if job_list:
         jobs = job_list.find_all('li')
-        with open(f"{portal}_jobs.txt", "w", encoding="utf-8") as file:
-            for job in jobs:
-                title_element = job.find('a', class_='chakra-button css-1djbb1k')
-                company_name_element = job.find('span', class_='css-lvyu5j').find('span')
-                job_link_element = title_element['href']
-                job_location_element = job.find('span', class_='css-1t92pv')
-                job_salary_element = job.find('p', class_='chakra-text css-1g1y608')
-
-
-                title = title_element.text.strip() if title_element else "N/A"
-                company_name = company_name_element.text.strip() if company_name_element else "N/A"
-                job_link = "https://www.simplyhired.co.in" + job_link_element if job_link_element else "N/A"
-                job_location = job_location_element.text.strip() if job_location_element else "N/A"
-                job_salary = job_salary_element.text.strip() if job_salary_element else "N/A"
-
-                salary_min = None
-                salary_max = None
-
-                if job_salary:
-                    salary_min, salary_max = extract_salary(job_salary)
-
-                job_info = {
-                    "title": title,
-                    "company_name": company_name,
-                    "job_link": job_link,
-                    "job_location": job_location,
-                    "job_salary": job_salary,
-                    "salary_min": salary_min,
-                    "salary_max": salary_max,
-                    "source": portal
-                }
-                await insert_job(job_info)
-                createFile(file, title, company_name, job_link, job_location, None, None, None, job_salary, portal, None)
\ No newline at end of file
+        # with open(f"{portal}_jobs.txt", "w", encoding="utf-8") as file:
+        for job in jobs:
+            # print(job)
+            title_element = job.find('a', class_='chakra-button css-1djbb1k')
+            title = title_element.text.strip() if title_element else None
+            if title is None:
+                continue
+            company_name_element = job.find('span', class_='css-lvyu5j').find('span')
+            company_name = company_name_element.text.strip() if company_name_element else None
+
+            if company_name:
+                normalized_company_name = company_name.strip().lower()
+                company = await db.company.find_unique(where={'company_name': normalized_company_name})
+
+                if company and title:
+                    existing_job = await db.job.find_first(where={
+                        "title": title,
+                        "companyId": company.id,
+                        "status": "active"
+                    })
+
+                    if existing_job:
+                        logger.info(f"Job '{title}' already exists for company '{company_name}'")
+                        continue
+            if not company_name:
+                company_name = "Unknown Company"
+
+            logger.info(f"Title: {title}, Company Name: {company_name}")
+            jobs_count += 1
+
+            job_link_element = title_element['href']
+            job_location_element = job.find('span', class_='css-1t92pv')
+            job_salary_element = job.find('p', class_='chakra-text css-1g1y608')
+            job_link = "https://www.simplyhired.co.in" + job_link_element if job_link_element else None
+            job_location = job_location_element.text.strip() if job_location_element else None
+            job_salary = job_salary_element.text.strip() if job_salary_element else None
+
+            salary_min = None
+            salary_max = None
+
+            if job_salary:
+                salary_min, salary_max = extract_salary(job_salary)
+
+            job_description, job_type, posted, skills_required, company_logo_url = fetch_job_details_simplyhired(job_link)
+
+            time.sleep(1)
+            jd_extracted = extract_job_details_with_AI(job_description)
+
+            experience_min = jd_extracted['experience_min']
+            experience_max = jd_extracted['experience_max']
+            experience = jd_extracted['experience']
+            end_date = jd_extracted['end_date']
+
+            job_info = {
+                "title": title,
+                "company_name": company_name,
+                "company_logo": company_logo_url,
+                "job_link": job_link,
+                "job_location": job_location,
+                "job_salary": job_salary,
+                "salary_min": salary_min,
+                "salary_max": salary_max,
+                "job_description": job_description,
+                "job_type": job_type,
+                "posted": posted,
+                "skills_required": skills_required,
+                "experience_min": experience_min,
+                "experience_max": experience_max,
+                "experience": experience,
+                "end_date": end_date,
+                "source": portal
+            }
+            try:
+                # print(job_info)
+                await insert_job(job_info)
+            except Exception as e:
+                print(f"Error inserting job for {job_info.get('title', 'unknown')}: {e}")
+    return jobs_count
+
+
+def fetch_job_details_simplyhired(job_url):
+    try:
+        proxy_url = f"http://api.scraperapi.com?api_key={scraperapi_key}&url={job_url}"
+        response = requests.get(proxy_url, headers=headers)
+        response.raise_for_status()
+        job_page = BeautifulSoup(response.text, 'html.parser')
+
+        # Company Logo
+        company_logo = job_page.find('img', {'data-testid': 'companyVJLogo'})
+        company_logo_url = company_logo['src'] if company_logo else None
+
+        # Job Type
+        job_type_div = job_page.find('div', {'data-testid': 'viewJobBodyJobDetailsContainer'})
+        job_type = job_type_div.find('span', {'data-testid': 'detailText'}).text
+
+        # Posted TimeStamp
+        posted = None
+        # timestamp_span = job_page.find('span', {'data-testid': 'viewJobBodyJobPostingTimestamp'})
+        # if timestamp_span:
+        #     posted_str = timestamp_span.find('span', {'data-testid': 'detailText'}).text.strip()
+        #     try:
+        #         posted = datetime.strptime(posted_str, "%Y-%m-%d")
+        #     except ValueError:
+        #         if 'hours ago' in posted_str:
+        #             hours = int(posted_str.split()[0])
+        #             posted = datetime.now() - timedelta(hours=hours)
+
+        # Skills Required
+        qualifications_container = job_page.find('div', {'data-testid': 'viewJobQualificationsContainer'})
+        qualification_items = qualifications_container.find_all('span', {'data-testid': 'viewJobQualificationItem'})
+        skills_required = [item.text for item in qualification_items]
+
+        # Job Description
+        job_description = job_page.find('div', {'data-testid': 'viewJobBodyJobFullDescriptionContent'})
+        job_description = job_description.text.strip() if job_description else None
+
+        return job_description, job_type, posted, skills_required, company_logo_url
+
+    except requests.exceptions.RequestException as e:
+        print(f"Failed to fetch job details from {job_url}: {e}")
+        return None
+
+
+searchKeywords = [
+    "software developer",
+    "data scientist",
+    "full stack developer",
+    "python developer",
+    "project manager",
+    "machine learning engineer",
+    "data analyst",
+    "cloud engineer",
+    "frontend developer",
+    "backend developer",
+    "product manager",
+    "devops engineer",
+    "HR manager",
+    "digital marketing",
+    "business analyst",
+    "sales manager",
+    "AI research scientist",
+    "web developer",
+    "graphic designer",
+    "react developer"
+]
+
+async def scrape_simplyhired():
+    total_jobs = 0
+    logger.info("Starting to scrape SimplyHired")
+    for keyword in searchKeywords:
+        keyword_hyphenated = keyword.replace(" ", "+")
+        base_url = "https://www.simplyhired.co.in/search"
+        currentJobs = 0
+        for job_type in ["CF3CP", "VDTG7"]:
+            params = {
+                "q": keyword_hyphenated,
+                "l": "india",
+                "t": "1",
+                "jt": job_type
+            }
+            if job_type == "CF3CP":
+                params["mip"] = "830000"
+
+            # Get first page
+            url = base_url + "?" + "&".join(f"{k}={v}" for k, v in params.items())
+            page_count = 1
+
+            while True:
+                try:
+                    logger.info(f"Processing page {page_count} for {keyword} ({'Full Time' if job_type == 'CF3CP' else 'Internship'})")
+                    proxy_url = f"http://api.scraperapi.com?api_key={scraperapi_key}&url={url}"
+                    response = requests.get(proxy_url, headers=headers)
+                    response.raise_for_status()
+                    soup = BeautifulSoup(response.text, 'html.parser')
+                    logger.info(f"Scraping page {page_count} for {keyword} ({'Full Time' if job_type == 'CF3CP' else 'Internship'})")
+
+                    currentJobs += await scrape_simplyhired_jobpage(soup)
+                    print(f"Jobs fetched: {currentJobs} for {keyword} jobs on page {page_count}")
+
+                    pagination = soup.find('nav', {'data-testid': 'pageNumberContainer'})
+                    if not pagination:
+                        break
+
+                    next_page = pagination.find('a', {'data-testid': f'paginationBlock{page_count + 1}'})
+                    if not next_page or page_count >= 10:  # Limit to 10 pages
+                        break
+
+                    url = next_page['href']
+                    page_count += 1
+                    # time.sleep(1)
+
+                except Exception as e:
+                    print(f"Error on page {page_count}: {str(e)}")
+                    break
+        total_jobs += currentJobs
+        print(f"Jobs fetched: {total_jobs} for {keyword} jobs")
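As with Glassdoor, `fetch_job_details_simplyhired` returns a single `None` on request errors (and can also raise `AttributeError` if the job-type or qualifications containers are missing), while the caller unpacks five values in one statement. A hedged guard around that call, keeping the names used in the diff, could be:

```python
# Sketch: tolerate a failed detail fetch instead of raising
# "cannot unpack non-iterable NoneType object".
details = fetch_job_details_simplyhired(job_link)
if not details:
    logger.info(f"Skipping '{title}': could not fetch SimplyHired job details")
    continue
job_description, job_type, posted, skills_required, company_logo_url = details
```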
diff --git a/function/crawler/job_portals/upwork.py b/function/crawler/job_portals/upwork.py
index 75d62bb..a854413 100644
--- a/function/crawler/job_portals/upwork.py
+++ b/function/crawler/job_portals/upwork.py
@@ -1,5 +1,5 @@
-from function.utils import createFile, fetch_job_details, extract_salary
+from function.utils import createFile, fetch_job_details_linkedin, extract_salary
 from function.insert_job import insert_job
 from function.utils import extract_salary
diff --git a/function/crawler/run_crawler.py b/function/crawler/run_crawler.py
index 9bf3153..16c2145 100644
--- a/function/crawler/run_crawler.py
+++ b/function/crawler/run_crawler.py
@@ -1,33 +1,56 @@
 import asyncio
-from function.crawler.crawler import scrapejobsdata, scrape_workday_jobs
+from function.crawler.crawler import scrapejobsdata, scrape_workday_jobs, scrape_linkedin_jobs, scrape_glassdoor_jobs, scrape_simplyhired_jobs
 
 searchKeywords = [
     "software developer",
-    "data scientist",
-    "full stack developer",
-    "python developer",
-    "project manager",
-    "machine learning engineer",
-    "data analyst",
-    "cloud engineer",
-    "frontend developer",
-    "backend developer",
-    "product manager",
-    "devops engineer",
-    "HR manager",
-    "digital marketing",
-    "business analyst",
-    "sales manager",
-    "AI research scientist",
-    "web developer",
-    "graphic designer",
-    "react developer"
+    # "data scientist",
+    # "full stack developer",
+    # "python developer",
+    # "project manager",
+    # "machine learning engineer",
+    # "data analyst",
+    # "cloud engineer",
+    # "frontend developer",
+    # "backend developer",
+    # "product manager",
+    # "devops engineer",
+    # "HR manager",
+    # "digital marketing",
+    # "business analyst",
+    # "sales manager",
+    # "AI research scientist",
+    # "web developer",
+    # "graphic designer",
+    # "react developer"
 ]
 
 async def run_crawler():
-    await scrape_workday_jobs()
-    for keyword in searchKeywords:
-        await scrapejobsdata(keyword)
+    # await scrape_workday_jobs()
+    # Once/Twice Daily
+    # 20 jobs, 70 Sec =>
+    # Todo :- Add more workday links.
+
+
+    # LinkedIn
+    # await scrape_linkedin_jobs()
+    # Hourly
+    # Todo :- Gemini RPM Issue.
+
+
+
+    # await scrape_glassdoor_jobs()
+    # Daily
+    # Todo :- Check & Scrape properly.
+    #        - Add all Necessary Keywords
+
+    await scrape_simplyhired_jobs()
+    # Daily
+    # Todo :- Fix posted_date
+
+
+
+    # for keyword in searchKeywords:
+    #     await scrapejobsdata(keyword)
 
 if __name__ == "__main__":
     asyncio.run(run_crawler())
diff --git a/function/insert_job.py b/function/insert_job.py
index 783b2ab..9b96b32 100644
--- a/function/insert_job.py
+++ b/function/insert_job.py
@@ -77,12 +77,12 @@ def to_lowercase(value):
 
     # 🔹 Create job document
     job_document = {
-        "title": to_lowercase(job.get('title', 'N/A')),
+        "title": job.get('title', 'N/A'),
         "job_id": job_id,
         "job_link": job.get('job_link', 'N/A'),
-        "job_type": to_lowercase(job.get('job_type', 'N/A')),
-        "apply_link": job.get('apply_link'),
-        "job_location": to_lowercase(job.get('job_location', 'N/A')),
+        "job_type": job.get('job_type', 'N/A'),
+        # "apply_link": job.get('apply_link'),
+        "job_location": job.get('job_location', 'N/A'),
         "salary_min": salary_min,
         "salary_max": salary_max,
         "job_salary": job.get('job_salary'),
@@ -91,7 +91,7 @@ def to_lowercase(value):
         "experience": job.get('experience'),
         "job_description": job.get('job_description'),
         "skills_required": ", ".join(job.get('skills_required', [])) if isinstance(job.get('skills_required'), list) else job.get('skills_required', 'N/A'),
-        "source": to_lowercase(job.get('source', 'N/A')),
+        "source": job.get('source', 'N/A'),
         "source_logo": job.get('source_logo'),
         "posted": posted_date,
         "end_date": end_date,
@@ -102,9 +102,11 @@ def to_lowercase(value):
     # Check if the job already exists
     existing_job = await db.job.find_first(where={
         "title": job_document['title'],
-        "companyId": job_document['companyId']
+        "companyId": job_document['companyId'],
+        "status": "active"
     })
     if existing_job:
+        logger.info(f"{job_document['title']} already exists for this company")
         return jsonify({"error": "Job already exists for this company"}), 400  # Return error or handle differently
 
     # Proceed to insert the job if it doesn't exist
diff --git a/function/utils.py b/function/utils.py
index 84fbd0e..82d9387 100644
--- a/function/utils.py
+++ b/function/utils.py
@@ -41,22 +41,37 @@ def createFile(file, title, company_name, job_link, job_location, job_descriptio
         print("Missing required fields (title, job_link, or job_location). Skipping this job.")
 
 
-def fetch_job_details(job_url):
+def fetch_job_details_linkedin(job_url):
     try:
         response = requests.get(job_url, headers=headers)
         response.raise_for_status()
         job_page = BeautifulSoup(response.text, 'html.parser')
-        with open("QJobpage_linkedin.html", "w", encoding="utf-8") as file:
-            file.write(job_page.prettify())
 
         salary_element = job_page.find('div', class_='salary compensation__salary')
         salary = None
         if salary_element:
             salary = salary_element.text.strip()
-
+
+        # Legitimate company check
+        company_legit_check = None
+        logo_link = job_page.find("a", {"data-tracking-control-name": "public_jobs_topcard_logo"})
+
+        if logo_link:
+            logo_img = logo_link.find("img")
+
+            if logo_img and "artdeco-entity-image--ghost" in logo_img.get("class", []):
+                company_legit_check = None
+            else:
+                company_legit_check = "Yes"
+
         job_info = {}
+        job_description = None
+        job_description_div = job_page.find("div", class_="description__text")
+        if job_description_div:
+            job_description = job_description_div.get_text(strip=True) if job_description_div else None
+
         for item in job_page.find_all("li", class_="description__job-criteria-item"):
             criterion = item.find("h3", class_="description__job-criteria-subheader").text.strip()
             value = item.find("span", class_="description__job-criteria-text").text.strip()
@@ -64,7 +79,7 @@
                 value = None
             job_info[criterion] = value
 
-        return salary, job_info['Seniority level'], job_info['Employment type']
+        return salary, job_info['Seniority level'], job_info['Employment type'], job_description, company_legit_check
 
     except requests.exceptions.RequestException as e:
         print(f"Failed to fetch salary from {job_url}: {e}")
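With this rename, `fetch_job_details_linkedin` now returns a five-item tuple (salary, seniority level, employment type, description, legitimacy flag) instead of three, and it still falls through to an implicit `None` when the request fails, so every caller that still imports it (foundit, indeed, internshala, upwork, freelancer) needs the new unpacking. A usage sketch matching the new return shape:

```python
# Sketch: unpack the extended return value; job_details is None on failure.
job_details = fetch_job_details_linkedin(job_url)
if job_details:
    job_salary, seniority, job_type, job_description, company_legit_check = job_details
```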
diff --git a/requirements.txt b/requirements.txt
index 0f01e88..db6ccd7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,179 +1,29 @@
-acres==0.3.0
-aiohappyeyeballs==2.4.8
 aiohttp==3.11.13
-aiosignal==1.3.2
-annotated-types==0.7.0
-anyio==4.6.2.post1
-arabic-reshaper==3.0.0
-asgiref==3.8.1
-asn1crypto==1.5.1
-attrs==25.1.0
-axios==0.4.0
-bcrypt==4.2.0
-beautifulsoup4==4.12.3
-blinker==1.8.2
-blis==1.2.0
+beautifulsoup4==4.13.3
 boto3==1.37.17
 botocore==1.37.17
-Brotli==1.1.0
-cachetools==5.5.0
-catalogue==2.0.10
-certifi==2024.8.30
-cffi==1.17.1
-chardet==5.2.0
-charset-normalizer==3.4.0
-ci-info==0.3.0
-click==8.1.7
-cloudpathlib==0.21.0
-confection==0.1.5
-configobj==5.0.9
-configparser==7.2.0
-cryptography==43.0.3
-cssselect2==0.8.0
-cymem==2.0.11
-dateparser==1.2.1
-DateTime==5.5
-defusedxml==0.7.1
-distro==1.9.0
-docx2txt==0.8
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85
-etelemetry==0.3.1
-filelock==3.17.0
-Flask==3.0.3
-Flask-Cors==5.0.0
-fonttools==4.56.0
-frozenlist==1.5.0
-google-ai-generativelanguage==0.6.10
-google-api-core==2.24.0
-google-api-python-client==2.158.0
-google-auth==2.37.0
-google-auth-httplib2==0.2.0
-google-generativeai==0.8.3
-googleapis-common-protos==1.66.0
-grpcio==1.69.0
-grpcio-status==1.69.0
-h11==0.14.0
-h2==4.1.0
-hpack==4.0.0
-html5lib==1.1
-httpcore==1.0.6
-httplib2==0.22.0
-httpx==0.27.2
-Hypercorn==0.17.3
-hyperframe==6.0.1
-idna==3.10
-isodate==0.6.1
-itsdangerous==2.2.0
-Jinja2==3.1.4
-jiter==0.8.2
-jmespath==1.0.1
-jwt==1.3.1
-langcodes==3.5.0
-language_data==1.3.0
-looseversion==1.3.0
-lxml==5.3.0
-marisa-trie==1.2.1
-markdown-it-py==3.0.0
-MarkupSafe==3.0.2
-mdurl==0.1.2
-multidict==6.1.0
-murmurhash==1.0.12
-networkx==3.4.2
-nibabel==5.3.2
-nipype==1.9.2
-nodeenv==1.9.1
-numpy==2.2.3
-openai==1.59.6
-oscrypto==1.3.0
-outcome==1.3.0.post0
-packaging==24.2
-pandas==2.2.3
-pathlib==1.0.1
-pdfminer.six==20231228
+Flask==3.1.0
+Flask_Cors==5.0.0
+httpx==0.28.1
+Jinja2==3.1.6
 pdfplumber==0.11.5
-pillow==11.1.0
-preshed==3.0.9
-priority==2.0.0
-prisma==0.15.0
-propcache==0.3.0
-proto-plus==1.25.0
 protobuf==5.29.3
-prov==2.0.1
-puremagic==1.28
-pyasn1==0.6.1
-pyasn1_modules==0.4.1
-pycparser==2.22
 pycron==3.1.2
-pydantic==2.9.2
-pydantic_core==2.23.4
-pydot==3.0.4
-pydyf==0.11.0
-Pygments==2.18.0
-pyHanko==0.26.0
-pyhanko-certvalidator==0.26.8
 PyJWT==2.10.1
-PyMuPDF==1.25.3
-pyparsing==3.2.1
-pypdf==5.4.0
-PyPDF2==3.0.1
-pypdfium2==4.30.1
-pyphen==0.17.2
-PySocks==1.7.1
-python-bidi==0.6.6
-python-dateutil==2.9.0.post0
-python-docx==1.1.2
 python-dotenv==1.0.1
-pytz==2024.2
-pyxnat==1.6.3
-PyYAML==6.0.2
-qrcode==8.0
-rdflib==6.3.2
-regex==2024.11.6
+python_bcrypt==0.3.2
+python_docx==1.1.2
 reportlab==4.3.1
-requests==2.32.3
-rich==13.9.4
-rsa==4.9
-s3transfer==0.11.4
-scipy==1.15.2
-selenium==4.28.1
-setuptools==75.8.2
-shellingham==1.5.4
-simplejson==3.20.1
-six==1.17.0
-smart-open==7.1.0
-sniffio==1.3.1
-sortedcontainers==2.4.0
-soupsieve==2.6
-spacy==3.8.4
-spacy-legacy==3.0.12
-spacy-loggers==1.0.5
-srsly==2.5.1
-svglib==1.5.1
-thinc==8.3.4
-tinycss2==1.4.0
-tinyhtml5==2.0.0
-tomlkit==0.13.2
-tqdm==4.67.1
-traits==7.0.2
-trio==0.29.0
-trio-websocket==0.12.2
-typer==0.15.2
-typing_extensions==4.12.2
-tzdata==2025.1
-tzlocal==5.3.1
-uritemplate==4.1.1
-uritools==4.0.3
-urllib3==2.2.3
-wasabi==1.1.3
-weasel==0.4.1
-weasyprint==64.1
-webdriver-manager==4.0.2
-webencodings==0.5.1
-websocket-client==1.8.0
-Werkzeug==3.0.4
-wrapt==1.17.2
-wsproto==1.2.0
+Requests==2.32.3
+selenium==4.30.0
+webdriver_manager==4.0.2
 xhtml2pdf==0.2.17
-yarl==1.18.3
-zope.interface==7.1.1
-zopfli==0.2.3.post1
+gunicorn==21.2.0
+PyMuPDF==1.23.26
+google-ai-generativelanguage==0.6.10
+google-api-core==2.24.0
+google-api-python-client==2.158.0
+google-auth==2.37.0
+google-auth-httplib2==0.2.0
+google-generativeai==0.8.3
+googleapis-common-protos==1.66.0
\ No newline at end of file
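The dependency list is trimmed aggressively here, and `prisma` itself is no longer pinned in requirements.txt (the Dockerfile installs it separately). A quick hedged smoke test of the trimmed set, run inside the built image (the image tag and script name are assumptions), could be:

```python
# smoke_test.py — minimal import check for the trimmed dependency set.
# e.g. `docker run --rm jobflow-api python smoke_test.py` (image tag assumed).
import flask, fitz, bs4, boto3, google.generativeai  # noqa: F401

print("core imports OK")
```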