32 changes: 21 additions & 11 deletions Dockerfile
@@ -1,31 +1,44 @@
# Stage 1: Build Stage
FROM python:3.9-slim AS builder
FROM python:3.9.18-slim AS builder

# Set environment variables to prevent .pyc files and enable unbuffered logging
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1

# Install build dependencies
# Install build dependencies including PyMuPDF requirements
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
libmupdf-dev \
python3-dev \
&& rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /jobflow-api

# Copy dependency file and install dependencies
COPY requirements.txt .
RUN pip install --upgrade pip && pip install -r requirements.txt
RUN pip install --upgrade pip && \
pip install -r requirements.txt && \
pip install gunicorn # Add explicit gunicorn installation

# Generate Prisma client
# Generate Prisma client (Fixed command)
COPY db/ ./db/
RUN pip install prisma && python -c "from prisma import generate_client; generate_client()"
COPY prisma/ ./prisma/
RUN pip install prisma && \
prisma generate --schema=prisma/schema.prisma

# Copy the rest of the application code
COPY . .

# Stage 2: Final Image
FROM python:3.9-slim
FROM python:3.9.18-slim

# Install runtime dependencies for PyMuPDF
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
curl \
libmupdf-dev \
&& rm -rf /var/lib/apt/lists/*

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
@@ -45,13 +58,10 @@ WORKDIR /jobflow-api
# Copy only the installed packages and application from the builder stage
COPY --from=builder /usr/local/lib/python3.9/site-packages /usr/local/lib/python3.9/site-packages
COPY --from=builder /jobflow-api /jobflow-api
# Copy gunicorn binary specifically
COPY --from=builder /usr/local/bin/gunicorn /usr/local/bin/

# Expose the port your app runs on
EXPOSE 5001

# Add a health check endpoint
RUN echo 'from flask import Blueprint\nhealth_blueprint = Blueprint("health", __name__)\n@health_blueprint.route("/health")\ndef health_check():\n return {"status": "healthy"}, 200' > /jobflow-api/controllers/health.py

# Use Gunicorn to run the app
CMD ["gunicorn", "--workers", "4", "--timeout", "300", "--bind", "0.0.0.0:5001", "app:app"]
9 changes: 6 additions & 3 deletions docker-compose.yml
@@ -1,5 +1,3 @@
version: '3.8'

services:
  web:
    build:
@@ -11,9 +9,14 @@ services:
      - DATABASE_URL=${DATABASE_URL}
      - SCRAPER_API=${SCRAPER_API}
      - JWT_SECRET=${JWT_SECRET}
      - GOOGLE_API_KEY=${GOOGLE_API_KEY}
      - JOBFLOW_AWS_REGION=${JOBFLOW_AWS_REGION}
      - JOBFLOW_AWS_ACCESS_KEY=${JOBFLOW_AWS_ACCESS_KEY}
      - JOBFLOW_AWS_SECRET_KEY=${JOBFLOW_AWS_SECRET_KEY}
      - AWS_BUCKET_NAME=${AWS_BUCKET_NAME}
    restart: always
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:5001/health"]
      test: ["CMD", "curl", "-f", "http://localhost:5001/api"]
      interval: 30s
      timeout: 10s
      retries: 3
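The added environment entries (GOOGLE_API_KEY plus the JOBFLOW_AWS_* and AWS_BUCKET_NAME group) only matter if the application reads them at startup. A sketch of the matching lookups, assuming plain os.getenv access; the module name and the fail-fast check are illustrative rather than part of this PR:

# settings.py (hypothetical module) mirroring the variables added in docker-compose.yml
import os

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
JOBFLOW_AWS_REGION = os.getenv("JOBFLOW_AWS_REGION")
JOBFLOW_AWS_ACCESS_KEY = os.getenv("JOBFLOW_AWS_ACCESS_KEY")
JOBFLOW_AWS_SECRET_KEY = os.getenv("JOBFLOW_AWS_SECRET_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")

# Fail fast if the container was started without the required AWS settings.
_missing = [name for name in ("JOBFLOW_AWS_ACCESS_KEY", "JOBFLOW_AWS_SECRET_KEY", "AWS_BUCKET_NAME")
            if not os.getenv(name)]
if _missing:
    raise RuntimeError(f"Missing environment variables: {', '.join(_missing)}")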
4 changes: 2 additions & 2 deletions function/__init__.py
@@ -1,4 +1,4 @@
# from .insert_job import insert_job
# from .crawler.crawler import scrapejobsdata, fetch_job_details
# from .crawler.crawler import scrapejobsdata, fetch_job_details_linkedin

# __all__ = ['scrapejobsdata', 'fetch_job_details']
# __all__ = ['scrapejobsdata', 'fetch_job_details_linkedin']
6 changes: 5 additions & 1 deletion function/aiHelper.py
@@ -14,8 +14,10 @@
- job_salary (string or null): Full salary text (e.g., 12L - 15L annually) or null if not found.
- experience_min (int or null): Minimum years of experience (e.g., 2) or null if not found.
- experience_max (int or null): Maximum years of experience (e.g., 5) or null if not found.
- experience (string or null): Experience level (e.g., "mid-level", "Trainee", "new-grad", "fresher", "Experienced") understand what it could be based on the min & max experience and based on JD & role. If its hard or less precise return null.
- experience (string or null): Experience level (e.g., "Internship", "Entry-level", "Mid-level", "Experienced"); infer it from experience_min, experience_max, the JD, and the role. If it cannot be determined with reasonable confidence, return null.
- skills_required (list of strings or null): List of skills (e.g., ["Python", "SQL"]) or null if not found.
- job_type (string or null): Type of job (e.g., "Full-time", "Part-time", "Contract") or null if not found.
- end_date (string or null): End date of the job (e.g., "2024-12-31") or null if not found.

Return the result as a JSON object. If a field cannot be determined, use null. Be precise and avoid guessing.
"""
@@ -50,4 +52,6 @@ def extract_job_details_with_AI(job_description):
"experience_max": None,
"experience": None,
"skills_required": None,
"job_type": None,
"end_date": None,
}
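The fallback dictionary above now includes job_type and end_date alongside the earlier fields. A sketch of how a caller could normalize the model's reply so every expected key is present, assuming the model returns a JSON string shaped as the prompt describes (the helper name is illustrative):

import json

EXPECTED_KEYS = [
    "job_salary", "experience_min", "experience_max", "experience",
    "skills_required", "job_type", "end_date",
]

def normalize_ai_response(raw_text):
    """Parse the model's JSON reply and backfill any missing field with None."""
    try:
        data = json.loads(raw_text)
    except (TypeError, json.JSONDecodeError):
        data = {}
    return {key: data.get(key) for key in EXPECTED_KEYS}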
4 changes: 2 additions & 2 deletions function/crawler/__init__.py
@@ -1,5 +1,5 @@
# from .crawler import scrapejobsdata
# from .crawler import fetch_job_details
# from .crawler import fetch_job_details_linkedin
# from .crawler import createFile

# __all__ = ['scrapejobsdata', 'fetch_job_details', 'createFile']
# __all__ = ['scrapejobsdata', 'fetch_job_details_linkedin', 'createFile']
26 changes: 17 additions & 9 deletions function/crawler/crawler.py
@@ -28,18 +28,26 @@
async def scrape_workday_jobs():
await scrape_workday()

async def scrape_linkedin_jobs():
await scrape_linkedin()

async def scrape_glassdoor_jobs():
await scrape_glassdoor()

async def scrape_simplyhired_jobs():
await scrape_simplyhired()

async def scrapejobsdata(searchKeyword):
searchKeyword = urllib.parse.quote(searchKeyword) # Encodes spaces as %20

jobPortals = {
"glassdoor": f"https://www.glassdoor.co.in/Job/india-{searchKeyword}-jobs-SRCH_IL.0,5_IN115_KO6,27.htm?sc.keyword={searchKeyword}&sortBy=date_desc",
"linkedin": f"https://www.linkedin.com/jobs/search/?f_TPR=r2000&geoId=102713980&sortBy=DD",
"simplyhired": f"https://www.simplyhired.co.in/search?q={searchKeyword}&l=india&s=d&jt=CF3CP&t=1&mip=555000", # Use only important keywords & scrape
"indeed": f"https://in.indeed.com/jobs?q={searchKeyword}",
# "glassdoor": f"https://www.glassdoor.co.in/Job/india-{searchKeyword}-jobs-SRCH_IL.0,5_IN115_KO6,27.htm?sc.keyword={searchKeyword}&sortBy=date_desc",
# "simplyhired": f"https://www.simplyhired.co.in/search?q={searchKeyword}&l=india&s=d&jt=CF3CP&t=1&mip=555000", # Use only important keywords & scrape
# "indeed": f"https://in.indeed.com/jobs?q={searchKeyword}",
# "ycombinator": f"https://www.workatastartup.com/companies?query={searchKeyword}&sortBy=keyword", # Need signin
# "internshala": f"https://internshala.com/jobs/salary-7/", # Not in sorted order or filter by date.
"upwork": f"https://www.upwork.com/nx/search/jobs/?q={searchKeyword}",
"freelancer": f"https://www.freelancer.com/search/projects?q={searchKeyword}",
# "upwork": f"https://www.upwork.com/nx/search/jobs/?q={searchKeyword}",
# "freelancer": f"https://www.freelancer.com/search/projects?q={searchKeyword}",
# "naukri": f"https://www.naukri.com/jobs-in-india?jobAge=1", # Use all filters to limit quality jobs
# "foundit": f"https://www.foundit.in/srp/results?query={searchKeyword}", # Proxy issue
}
@@ -63,10 +71,10 @@ async def scrapejobsdata(searchKeyword):

if portal == 'linkedin':
print(portal)
await scrape_linkedin(soup)
# await scrape_linkedin(soup)

elif portal == 'glassdoor':
await scrape_glassdoor(soup)
# elif portal == 'glassdoor':
# await scrape_glassdoor(soup)

elif portal == 'indeed':
await scrape_indeed(soup)
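The new wrappers (scrape_linkedin_jobs, scrape_glassdoor_jobs, scrape_simplyhired_jobs) take over from the keyword-driven path for those portals, which is now commented out. How they are scheduled is outside this diff; a sketch of a manual run, assuming they can simply be awaited in sequence:

import asyncio

from function.crawler.crawler import (
    scrape_glassdoor_jobs,
    scrape_linkedin_jobs,
    scrape_simplyhired_jobs,
)

async def run_all_portals():
    # Sequential on purpose to go easy on the ScraperAPI quota; asyncio.gather would parallelize.
    await scrape_linkedin_jobs()
    await scrape_glassdoor_jobs()
    await scrape_simplyhired_jobs()

if __name__ == "__main__":
    asyncio.run(run_all_portals())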
2 changes: 1 addition & 1 deletion function/crawler/job_portals/foundit.py
@@ -1,4 +1,4 @@
from function.utils import createFile, fetch_job_details
from function.utils import createFile, fetch_job_details_linkedin
from function.insert_job import insert_job

async def scrape_foundit(soup):
2 changes: 1 addition & 1 deletion function/crawler/job_portals/freelancer.py
@@ -1,4 +1,4 @@
from function.utils import createFile, fetch_job_details, extract_salary
from function.utils import createFile, fetch_job_details_linkedin, extract_salary
from function.insert_job import insert_job
from function.utils import extract_salary

156 changes: 139 additions & 17 deletions function/crawler/job_portals/glassdoor.py
@@ -1,19 +1,64 @@
from function.utils import createFile, fetch_job_details, extract_salary
from function.utils import createFile, fetch_job_details_linkedin, extract_salary
from function.insert_job import insert_job
from function.utils import extract_salary
from dotenv import load_dotenv
import os
import logging
import requests
from bs4 import BeautifulSoup
import urllib
import time
from db.prisma import db
from function.aiHelper import extract_job_details_with_AI

async def scrape_glassdoor(soup):

logging.basicConfig(filename= 'log.txt', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

load_dotenv()

scraperapi_key = os.getenv('SCRAPER_API')

headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}

async def scrape_glassdoor_jobpage(soup):
portal = 'glassdoor'
print("inside glassdoor scrape")
print("inside glassdoor scrape \n")
job_list = soup.find('ul', class_='JobsList_jobsList__lqjTr')
jobs_count = 0
if job_list:
jobs = job_list.find_all('li')
with open(f"{portal}_jobs.txt", "w", encoding="utf-8") as file:
for job in jobs:
title_element = job.find('a', class_='JobCard_jobTitle__GLyJ1')
title = title_element.text.strip() if title_element else None
if title is None:
continue
company_name_element = job.find('span', class_='EmployerProfile_compactEmployerName__9MGcV')
company_name = company_name_element.text.strip() if company_name_element else None

if company_name:
normalized_company_name = company_name.strip().lower()
company = await db.company.find_unique(where={'company_name': normalized_company_name})

if company and title:
existing_job = await db.job.find_first(where={
"title": title,
"companyId": company.id,
"status": "active"
})

if existing_job:
logger.info(f"Job '{title}' already exists for company '{company_name}'")
continue

if not company_name:
company_name = "Unknown Company"

logger.info(f"Title: {title}, Company Name: {company_name}")
jobs_count += 1
job_link = title_element['href'] if title_element else None
job_location_element = job.find('div', class_='JobCard_location__Ds1fM')
job_location = job_location_element.text.strip() if job_location_element else None
@@ -27,19 +72,20 @@ async def scrape_glassdoor(soup):

company_logo = logo_url

job_description_section = job.find('div', class_='JobCard_jobDescriptionSnippet__l1tnl')
job_description = None
skills = []
if job_description_section:
description_divs = job_description_section.find_all('div')
if len(description_divs) > 0:
job_description = description_divs[0].text.strip()
if len(description_divs) > 1:
skills_section = description_divs[1]
if skills_section:
skills_text = skills_section.text.strip().replace('Skills:', '').strip()
skills = [skill.strip() for skill in skills_text.split(',')]
time.sleep(2)
job_description = fetch_job_details_glassdoor(job_link)
job_description = job_description.get_text(strip=True)

jd_extracted = extract_job_details_with_AI(job_description)

skills = jd_extracted['skills_required']
experience_min = jd_extracted['experience_min']
experience_max = jd_extracted['experience_max']
experience = jd_extracted['experience']
end_date = jd_extracted['end_date']
job_type = jd_extracted['job_type']


salary_min = None
salary_max = None

@@ -57,12 +103,88 @@
"salary_min": salary_min,
"salary_max": salary_max,
"skills_required": skills,
"experience_level": None,
"experience_level": experience,
"experience_min": experience_min,
"experience_max": experience_max,
"end_date": end_date,
"job_type": job_type,
"job_description": job_description,
"source": portal
}
try:
await insert_job(job_info)
createFile(file, title, company_name, job_link, job_location, job_description, skills, None, job_salary, portal, None)
# createFile(file, title, company_name, job_link, job_location, job_description, skills, None, job_salary, portal, None)
except Exception as e:
print(f"Error inserting job for {job_info.get('title', 'unknown')}: {e}")
return jobs_count

def fetch_job_details_glassdoor(job_url):
try:
proxy_url = f"http://api.scraperapi.com?api_key={scraperapi_key}&url={job_url}"
response = requests.get(proxy_url, headers=headers)
response.raise_for_status()
job_page = BeautifulSoup(response.text, 'html.parser')

job_description = job_page.find('div', class_=lambda x: x and 'JobDetails_jobDescription__uW_fK' in x)
return job_description

except requests.exceptions.RequestException as e:
print(f"Failed to fetch salary from {job_url}: {e}")
return None



searchKeywords = [
"software developer",
"data scientist",
"full stack developer",
"python developer",
"project manager",
"machine learning engineer",
"data analyst",
"cloud engineer",
"frontend developer",
"backend developer",
"product manager",
"devops engineer",
"HR manager",
"digital marketing",
"business analyst",
"sales manager",
"AI research scientist",
"web developer",
"graphic designer",
"react developer"
]

async def scrape_glassdoor():
total_jobs = 0

for keyword in searchKeywords:
keyword_hyphenated = keyword.replace(" ", "-")
ko_end = 6 + len(keyword_hyphenated) + 1
base_url = "https://www.glassdoor.co.in/Job/india-{keyword}-jobs-SRCH_IL.0,5_IN115_KO6,{ko_end}.htm"
seniority_types = ["entrylevel", "internship"]
currentJobs = 0
for seniority in seniority_types:
params = {
"maxSalary": 6000000,
"minSalary": 10000,
"fromAge": 1,
"sortBy": "date_desc",
"seniorityType": seniority
}
url = base_url.format(keyword=keyword_hyphenated, ko_end=ko_end)
url += "?" + "&".join(f"{k}={v}" for k, v in params.items())

proxy_url = f"http://api.scraperapi.com?api_key={scraperapi_key}&url={url}"
response = requests.get(proxy_url, headers=headers)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
print(f"URL for {seniority}: {url}")
time.sleep(2)
currentJobs += await scrape_glassdoor_jobpage(soup)
total_jobs += currentJobs
print(f"Jobs fetched: {currentJobs} for {keyword} jobs")

print(f"Total jobs fetched: {total_jobs} on glassdoor")
2 changes: 1 addition & 1 deletion function/crawler/job_portals/indeed.py
@@ -1,4 +1,4 @@
from function.utils import createFile, fetch_job_details
from function.utils import createFile, fetch_job_details_linkedin
from function.insert_job import insert_job
async def scrape_indeed(soup):
portal = 'indeed'
2 changes: 1 addition & 1 deletion function/crawler/job_portals/internshala.py
@@ -1,5 +1,5 @@

from function.utils import createFile, fetch_job_details
from function.utils import createFile, fetch_job_details_linkedin
from function.insert_job import insert_job
async def scrape_internshala(soup):
portal = 'internshala'