32 changes: 21 additions & 11 deletions Dockerfile
@@ -1,31 +1,44 @@
# Stage 1: Build Stage
FROM python:3.9-slim AS builder
FROM python:3.9.18-slim AS builder

# Set environment variables to prevent .pyc files and enable unbuffered logging
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1

# Install build dependencies
# Install build dependencies including PyMuPDF requirements
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
libmupdf-dev \
python3-dev \
&& rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /jobflow-api

# Copy dependency file and install dependencies
COPY requirements.txt .
RUN pip install --upgrade pip && pip install -r requirements.txt
RUN pip install --upgrade pip && \
pip install -r requirements.txt && \
pip install gunicorn # Add explicit gunicorn installation

# Generate Prisma client
# Generate Prisma client (Fixed command)
COPY db/ ./db/
RUN pip install prisma && python -c "from prisma import generate_client; generate_client()"
COPY prisma/ ./prisma/
RUN pip install prisma && \
prisma generate --schema=prisma/schema.prisma

# Copy the rest of the application code
COPY . .

# Stage 2: Final Image
FROM python:3.9-slim
FROM python:3.9.18-slim

# Install runtime dependencies for PyMuPDF
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
curl \
libmupdf-dev \
&& rm -rf /var/lib/apt/lists/*

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
@@ -45,13 +58,10 @@ WORKDIR /jobflow-api
# Copy only the installed packages and application from the builder stage
COPY --from=builder /usr/local/lib/python3.9/site-packages /usr/local/lib/python3.9/site-packages
COPY --from=builder /jobflow-api /jobflow-api
# Copy gunicorn binary specifically
COPY --from=builder /usr/local/bin/gunicorn /usr/local/bin/

# Expose the port your app runs on
EXPOSE 5001

# Add a health check endpoint
RUN echo 'from flask import Blueprint\nhealth_blueprint = Blueprint("health", __name__)\n@health_blueprint.route("/health")\ndef health_check():\n return {"status": "healthy"}, 200' > /jobflow-api/controllers/health.py

# Use Gunicorn to run the app
CMD ["gunicorn", "--workers", "4", "--timeout", "300", "--bind", "0.0.0.0:5001", "app:app"]
9 changes: 6 additions & 3 deletions docker-compose.yml
@@ -1,5 +1,3 @@
version: '3.8'

services:
  web:
    build:
@@ -11,9 +9,14 @@ services:
      - DATABASE_URL=${DATABASE_URL}
      - SCRAPER_API=${SCRAPER_API}
      - JWT_SECRET=${JWT_SECRET}
      - GOOGLE_API_KEY=${GOOGLE_API_KEY}
      - JOBFLOW_AWS_REGION=${JOBFLOW_AWS_REGION}
      - JOBFLOW_AWS_ACCESS_KEY=${JOBFLOW_AWS_ACCESS_KEY}
      - JOBFLOW_AWS_SECRET_KEY=${JOBFLOW_AWS_SECRET_KEY}
      - AWS_BUCKET_NAME=${AWS_BUCKET_NAME}
    restart: always
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:5001/health"]
      test: ["CMD", "curl", "-f", "http://localhost:5001/api"]
      interval: 30s
      timeout: 10s
      retries: 3
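The added environment entries (GOOGLE_API_KEY plus the JOBFLOW_AWS_* and AWS_BUCKET_NAME group) only matter if the application reads them at startup. A sketch of the matching lookups, assuming plain os.getenv access; the module name and the fail-fast check are illustrative rather than part of this PR:

# settings.py (hypothetical module) mirroring the variables added in docker-compose.yml
import os

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
JOBFLOW_AWS_REGION = os.getenv("JOBFLOW_AWS_REGION")
JOBFLOW_AWS_ACCESS_KEY = os.getenv("JOBFLOW_AWS_ACCESS_KEY")
JOBFLOW_AWS_SECRET_KEY = os.getenv("JOBFLOW_AWS_SECRET_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")

# Fail fast if the container was started without the required AWS settings.
_missing = [name for name in ("JOBFLOW_AWS_ACCESS_KEY", "JOBFLOW_AWS_SECRET_KEY", "AWS_BUCKET_NAME")
            if not os.getenv(name)]
if _missing:
    raise RuntimeError(f"Missing environment variables: {', '.join(_missing)}")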
4 changes: 2 additions & 2 deletions function/__init__.py
@@ -1,4 +1,4 @@
# from .insert_job import insert_job
# from .crawler.crawler import scrapejobsdata, fetch_job_details
# from .crawler.crawler import scrapejobsdata, fetch_job_details_linkedin

# __all__ = ['scrapejobsdata', 'fetch_job_details']
# __all__ = ['scrapejobsdata', 'fetch_job_details_linkedin']
6 changes: 5 additions & 1 deletion function/aiHelper.py
@@ -14,8 +14,10 @@
- job_salary (string or null): Full salary text (e.g., 12L - 15L annually) or null if not found.
- experience_min (int or null): Minimum years of experience (e.g., 2) or null if not found.
- experience_max (int or null): Maximum years of experience (e.g., 5) or null if not found.
- experience (string or null): Experience level (e.g., "mid-level", "Trainee", "new-grad", "fresher", "Experienced") understand what it could be based on the min & max experience and based on JD & role. If its hard or less precise return null.
- experience (string or null): Experience level (e.g., "Internship", "Entry-level", "Mid-level", "Experienced"); infer it from experience_min, experience_max, the JD, and the role. If it cannot be determined with reasonable confidence, return null.
- skills_required (list of strings or null): List of skills (e.g., ["Python", "SQL"]) or null if not found.
- job_type (string or null): Type of job (e.g., "Full-time", "Part-time", "Contract") or null if not found.
- end_date (string or null): End date of the job (e.g., "2024-12-31") or null if not found.

Return the result as a JSON object. If a field cannot be determined, use null. Be precise and avoid guessing.
"""
@@ -50,4 +52,6 @@ def extract_job_details_with_AI(job_description):
"experience_max": None,
"experience": None,
"skills_required": None,
"job_type": None,
"end_date": None,
}
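The fallback dictionary above now includes job_type and end_date alongside the earlier fields. A sketch of how a caller could normalize the model's reply so every expected key is present, assuming the model returns a JSON string shaped as the prompt describes (the helper name is illustrative):

import json

EXPECTED_KEYS = [
    "job_salary", "experience_min", "experience_max", "experience",
    "skills_required", "job_type", "end_date",
]

def normalize_ai_response(raw_text):
    """Parse the model's JSON reply and backfill any missing field with None."""
    try:
        data = json.loads(raw_text)
    except (TypeError, json.JSONDecodeError):
        data = {}
    return {key: data.get(key) for key in EXPECTED_KEYS}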
4 changes: 2 additions & 2 deletions function/crawler/__init__.py
@@ -1,5 +1,5 @@
# from .crawler import scrapejobsdata
# from .crawler import fetch_job_details
# from .crawler import fetch_job_details_linkedin
# from .crawler import createFile

# __all__ = ['scrapejobsdata', 'fetch_job_details', 'createFile']
# __all__ = ['scrapejobsdata', 'fetch_job_details_linkedin', 'createFile']
26 changes: 17 additions & 9 deletions function/crawler/crawler.py
@@ -28,18 +28,26 @@
async def scrape_workday_jobs():
await scrape_workday()

async def scrape_linkedin_jobs():
await scrape_linkedin()

async def scrape_glassdoor_jobs():
await scrape_glassdoor()

async def scrape_simplyhired_jobs():
await scrape_simplyhired()

async def scrapejobsdata(searchKeyword):
searchKeyword = urllib.parse.quote(searchKeyword) # Encodes spaces as %20

jobPortals = {
"glassdoor": f"https://www.glassdoor.co.in/Job/india-{searchKeyword}-jobs-SRCH_IL.0,5_IN115_KO6,27.htm?sc.keyword={searchKeyword}&sortBy=date_desc",
"linkedin": f"https://www.linkedin.com/jobs/search/?f_TPR=r2000&geoId=102713980&sortBy=DD",
"simplyhired": f"https://www.simplyhired.co.in/search?q={searchKeyword}&l=india&s=d&jt=CF3CP&t=1&mip=555000", # Use only important keywords & scrape
"indeed": f"https://in.indeed.com/jobs?q={searchKeyword}",
# "glassdoor": f"https://www.glassdoor.co.in/Job/india-{searchKeyword}-jobs-SRCH_IL.0,5_IN115_KO6,27.htm?sc.keyword={searchKeyword}&sortBy=date_desc",
# "simplyhired": f"https://www.simplyhired.co.in/search?q={searchKeyword}&l=india&s=d&jt=CF3CP&t=1&mip=555000", # Use only important keywords & scrape
# "indeed": f"https://in.indeed.com/jobs?q={searchKeyword}",
# "ycombinator": f"https://www.workatastartup.com/companies?query={searchKeyword}&sortBy=keyword", # Need signin
# "internshala": f"https://internshala.com/jobs/salary-7/", # Not in sorted order or filter by date.
"upwork": f"https://www.upwork.com/nx/search/jobs/?q={searchKeyword}",
"freelancer": f"https://www.freelancer.com/search/projects?q={searchKeyword}",
# "upwork": f"https://www.upwork.com/nx/search/jobs/?q={searchKeyword}",
# "freelancer": f"https://www.freelancer.com/search/projects?q={searchKeyword}",
# "naukri": f"https://www.naukri.com/jobs-in-india?jobAge=1", # Use all filters to limit quality jobs
# "foundit": f"https://www.foundit.in/srp/results?query={searchKeyword}", # Proxy issue
}
@@ -63,10 +71,10 @@ async def scrapejobsdata(searchKeyword):

if portal == 'linkedin':
print(portal)
await scrape_linkedin(soup)
# await scrape_linkedin(soup)

elif portal == 'glassdoor':
await scrape_glassdoor(soup)
# elif portal == 'glassdoor':
# await scrape_glassdoor(soup)

elif portal == 'indeed':
await scrape_indeed(soup)
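The new wrappers (scrape_linkedin_jobs, scrape_glassdoor_jobs, scrape_simplyhired_jobs) take over from the keyword-driven path for those portals, which is now commented out. How they are scheduled is outside this diff; a sketch of a manual run, assuming they can simply be awaited in sequence:

import asyncio

from function.crawler.crawler import (
    scrape_glassdoor_jobs,
    scrape_linkedin_jobs,
    scrape_simplyhired_jobs,
)

async def run_all_portals():
    # Sequential on purpose to go easy on the ScraperAPI quota; asyncio.gather would parallelize.
    await scrape_linkedin_jobs()
    await scrape_glassdoor_jobs()
    await scrape_simplyhired_jobs()

if __name__ == "__main__":
    asyncio.run(run_all_portals())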
2 changes: 1 addition & 1 deletion function/crawler/job_portals/foundit.py
@@ -1,4 +1,4 @@
from function.utils import createFile, fetch_job_details
from function.utils import createFile, fetch_job_details_linkedin
from function.insert_job import insert_job

async def scrape_foundit(soup):
2 changes: 1 addition & 1 deletion function/crawler/job_portals/freelancer.py
@@ -1,4 +1,4 @@
from function.utils import createFile, fetch_job_details, extract_salary
from function.utils import createFile, fetch_job_details_linkedin, extract_salary
from function.insert_job import insert_job
from function.utils import extract_salary

156 changes: 139 additions & 17 deletions function/crawler/job_portals/glassdoor.py
@@ -1,19 +1,64 @@
from function.utils import createFile, fetch_job_details, extract_salary
from function.utils import createFile, fetch_job_details_linkedin, extract_salary
from function.insert_job import insert_job
from function.utils import extract_salary
from dotenv import load_dotenv
import os
import logging
import requests
from bs4 import BeautifulSoup
import urllib
import time
from db.prisma import db
from function.aiHelper import extract_job_details_with_AI

async def scrape_glassdoor(soup):

logging.basicConfig(filename= 'log.txt', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

load_dotenv()

scraperapi_key = os.getenv('SCRAPER_API')

headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}

async def scrape_glassdoor_jobpage(soup):
portal = 'glassdoor'
print("inside glassdoor scrape")
print("inside glassdoor scrape \n")
job_list = soup.find('ul', class_='JobsList_jobsList__lqjTr')
jobs_count = 0
if job_list:
jobs = job_list.find_all('li')
with open(f"{portal}_jobs.txt", "w", encoding="utf-8") as file:
for job in jobs:
title_element = job.find('a', class_='JobCard_jobTitle__GLyJ1')
title = title_element.text.strip() if title_element else None
if title is None:
continue
company_name_element = job.find('span', class_='EmployerProfile_compactEmployerName__9MGcV')
company_name = company_name_element.text.strip() if company_name_element else None

if company_name:
normalized_company_name = company_name.strip().lower()
company = await db.company.find_unique(where={'company_name': normalized_company_name})

if company and title:
existing_job = await db.job.find_first(where={
"title": title,
"companyId": company.id,
"status": "active"
})

if existing_job:
logger.info(f"Job '{title}' already exists for company '{company_name}'")
continue

if not company_name:
company_name = "Unknown Company"

logger.info(f"Title: {title}, Company Name: {company_name}")
jobs_count += 1
job_link = title_element['href'] if title_element else None
job_location_element = job.find('div', class_='JobCard_location__Ds1fM')
job_location = job_location_element.text.strip() if job_location_element else None
@@ -27,19 +72,20 @@ async def scrape_glassdoor(soup):

company_logo = logo_url

job_description_section = job.find('div', class_='JobCard_jobDescriptionSnippet__l1tnl')
job_description = None
skills = []
if job_description_section:
description_divs = job_description_section.find_all('div')
if len(description_divs) > 0:
job_description = description_divs[0].text.strip()
if len(description_divs) > 1:
skills_section = description_divs[1]
if skills_section:
skills_text = skills_section.text.strip().replace('Skills:', '').strip()
skills = [skill.strip() for skill in skills_text.split(',')]
time.sleep(2)
job_description = fetch_job_details_glassdoor(job_link)
job_description = job_description.get_text(strip=True)

jd_extracted = extract_job_details_with_AI(job_description)

skills = jd_extracted['skills_required']
experience_min = jd_extracted['experience_min']
experience_max = jd_extracted['experience_max']
experience = jd_extracted['experience']
end_date = jd_extracted['end_date']
job_type = jd_extracted['job_type']


salary_min = None
salary_max = None

@@ -57,12 +103,88 @@
"salary_min": salary_min,
"salary_max": salary_max,
"skills_required": skills,
"experience_level": None,
"experience_level": experience,
"experience_min": experience_min,
"experience_max": experience_max,
"end_date": end_date,
"job_type": job_type,
"job_description": job_description,
"source": portal
}
try:
await insert_job(job_info)
createFile(file, title, company_name, job_link, job_location, job_description, skills, None, job_salary, portal, None)
# createFile(file, title, company_name, job_link, job_location, job_description, skills, None, job_salary, portal, None)
except Exception as e:
print(f"Error inserting job for {job_info.get('title', 'unknown')}: {e}")
return jobs_count

def fetch_job_details_glassdoor(job_url):
try:
proxy_url = f"http://api.scraperapi.com?api_key={scraperapi_key}&url={job_url}"
response = requests.get(proxy_url, headers=headers)
response.raise_for_status()
job_page = BeautifulSoup(response.text, 'html.parser')

job_description = job_page.find('div', class_=lambda x: x and 'JobDetails_jobDescription__uW_fK' in x)
return job_description

except requests.exceptions.RequestException as e:
print(f"Failed to fetch salary from {job_url}: {e}")
return None



searchKeywords = [
"software developer",
"data scientist",
"full stack developer",
"python developer",
"project manager",
"machine learning engineer",
"data analyst",
"cloud engineer",
"frontend developer",
"backend developer",
"product manager",
"devops engineer",
"HR manager",
"digital marketing",
"business analyst",
"sales manager",
"AI research scientist",
"web developer",
"graphic designer",
"react developer"
]

async def scrape_glassdoor():
total_jobs = 0

for keyword in searchKeywords:
keyword_hyphenated = keyword.replace(" ", "-")
ko_end = 6 + len(keyword_hyphenated) + 1
base_url = "https://www.glassdoor.co.in/Job/india-{keyword}-jobs-SRCH_IL.0,5_IN115_KO6,{ko_end}.htm"
seniority_types = ["entrylevel", "internship"]
currentJobs = 0
for seniority in seniority_types:
params = {
"maxSalary": 6000000,
"minSalary": 10000,
"fromAge": 1,
"sortBy": "date_desc",
"seniorityType": seniority
}
url = base_url.format(keyword=keyword_hyphenated, ko_end=ko_end)
url += "?" + "&".join(f"{k}={v}" for k, v in params.items())

proxy_url = f"http://api.scraperapi.com?api_key={scraperapi_key}&url={url}"
response = requests.get(proxy_url, headers=headers)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
print(f"URL for {seniority}: {url}")
time.sleep(2)
currentJobs += await scrape_glassdoor_jobpage(soup)
total_jobs += currentJobs
print(f"Jobs fetched: {currentJobs} for {keyword} jobs")

print(f"Total jobs fetched: {total_jobs} on glassdoor")
2 changes: 1 addition & 1 deletion function/crawler/job_portals/indeed.py
@@ -1,4 +1,4 @@
from function.utils import createFile, fetch_job_details
from function.utils import createFile, fetch_job_details_linkedin
from function.insert_job import insert_job
async def scrape_indeed(soup):
portal = 'indeed'
2 changes: 1 addition & 1 deletion function/crawler/job_portals/internshala.py
@@ -1,5 +1,5 @@

from function.utils import createFile, fetch_job_details
from function.utils import createFile, fetch_job_details_linkedin
from function.insert_job import insert_job
async def scrape_internshala(soup):
portal = 'internshala'