From 3620f8e1e86d38a572a8379658fdbf3888ecc3e0 Mon Sep 17 00:00:00 2001 From: barkat-10 Date: Fri, 2 May 2025 13:33:09 -0500 Subject: [PATCH 1/3] enricher part half complete --- crawler/enricher.ts | 191 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 crawler/enricher.ts diff --git a/crawler/enricher.ts b/crawler/enricher.ts new file mode 100644 index 0000000..90cf9f3 --- /dev/null +++ b/crawler/enricher.ts @@ -0,0 +1,191 @@ +import {Builder} from 'selenium-webdriver'; // Builder() is a Selenium class used to construct a WebDriver instance. +import chrome, { ServiceBuilder } from 'selenium-webdriver/chrome'; //we are supposed to mention the browser we will use +import chromedriver from 'chromedriver'; +import { Client } from 'pg'; +import 'dotenv/config'; // get the dot env file + +const client = new Client({ + host: process.env.PGHOST, + port: parseInt(process.env.PGPORT || '5432', 10), + user: process.env.PGUSER, + password: process.env.PGPASSWORD, + database: process.env.PGDATABASE + }); + + async function main(){ + try{ + + await client.connect(); + console.log('Connected to PostgreSQL'); + + const fullName = 'Aashna Gajaria'; + + const crawlerRes = await client.query( + `SELECT json FROM crawler_data WHERE json->>'fullName' = $1 LIMIT 1`, + [fullName] + ); + + if (crawlerRes.rows.length === 0) { + console.log(`No crawler data found for ${fullName}`); + return; + } + + const data = crawlerRes.rows[0].json; + + // Step 2: Insert relevant fields into enricher_data + await client.query(` + INSERT INTO enricher_data ( + profile_url, + timestamp, + full_name, + email, + phone_number, + high_school, + hs_graduation_year, + naf_academy, + naf_track_certified, + city, + current_job, + university, + degree, + linkedin_link, + university_grad_year, + internship_company1, + internship_end_date1 + ) VALUES ( + $1, CURRENT_TIMESTAMP, $2, $3, $4, $5, $6, $7, $8, + $9, $10, $11, $12, $13, $14, $15, NULL + ); + `, [ + data.linkedinLink, // profile_url + data.fullName, // full_name + data.email, // email + data.phoneNumber, // phone_number + data.highSchool, // high_school + data.HSGraduationYear, // hs_graduation_year + data.NAFAcademy, // naf_academy + data.NAFTrackCertified, // naf_track_certified + data.city, // city + data.currentJob, // current_job + data.university, // university + data.degree, // degree + data.linkedinLink, // linkedin_link + data.universityGradYear, // university_grad_year + data.internshipCompany1 // internship_company1 + ]); + + console.log(`Successfully enriched data for ${fullName}`); + + } + catch (err) { + console.error('Error:', err); + } + finally{ + await client.end(); + } + } + + main(); + +const serviceBuilder = new chrome.ServiceBuilder(chromedriver.path); +const Options = chrome.Options; //The Chrome-specific configuration options you can pass to customize how Chrome runs when used with Selenium WebDriver. + +interface Person{ + fullName: String; + email?: string; + phone?: string; + school?: string; +} + +interface SearchResult { + items?: { link: string }[]; + error?: any; + } +const person: Person = { + fullName: 'Shahreen Iqbal', + email: 'singhbarkat1011@gmail.com', + phone: '', + school: 'University Of Texas, Dallas', +} + + +async function findLinkedinProfile(person: Person): Promise { + const { fullName, school } = person; + const [firstName, ...lastNameParts] = fullName.split(" "); + const lastName = lastNameParts.join(" "); + + const apiKey = process.env.API_KEY; + const searchEngineId = process.env.SEARCH_ENGINE_ID; + + if (!apiKey || !searchEngineId) { + console.error('API_KEY or SEARCH_ENGINE_ID not set in .env'); + process.exit(1); + } + + const queryVariations = [ + `site:linkedin.com/in "${fullName}"`, + `site:linkedin.com/in "${firstName} ${lastName}"`, + school ? `site:linkedin.com/in "${fullName}" "${school}"` : '', + school ? `site:linkedin.com/in "${firstName} ${lastName}" "${school}"` : '' + ].filter(Boolean); // Remove empty strings + + const chromeOptions = new Options(); + chromeOptions.addArguments( + '--headless=new', + '--disable-gpu', + '--no-sandbox', + '--disable-application-cache', + '--disable-extensions', + '--disable-notifications', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disk-cache-size=0', + '--media-cache-size=0', + '--aggressive-cache-discard' + ); + + const driver = await new Builder() + .forBrowser('chrome') + .setChromeOptions(chromeOptions) + .setChromeService(serviceBuilder) + .build(); + + try { + const allResults: any[] = []; + const seenUrls = new Set(); + + for (const query of queryVariations) { + console.log(`Searching with query: ${query}`); + const url = `https://www.googleapis.com/customsearch/v1?key=${apiKey}&cx=${searchEngineId}&q=${encodeURIComponent(query)}&num=5&${Date.now()}`; + + const data = await driver.executeAsyncScript(function (url: string, callback: Function) { + fetch(url) + .then(res => res.json()) + .then(result => callback(result)) + .catch(err => callback({ error: err.toString() })); + }, url) as SearchResult; + + const results = (data.items || []).filter((item: any) => + item.link.includes('linkedin.com/in') && !seenUrls.has(item.link) + ); + + for (const result of results) { + seenUrls.add(result.link); + allResults.push(result); + } + } + + if (allResults.length === 0) { + console.log('No LinkedIn profiles found.'); + } else { + console.log(`Top LinkedIn Matches for "${fullName}":`); + allResults.forEach((item, index) => { + console.log(`${index + 1}. ${item.link}`); + }); + } + } finally { + await driver.quit(); + } +} + +findLinkedinProfile(person).catch(console.error); \ No newline at end of file From c49951288fffc704236f73c871d4d770e616e649 Mon Sep 17 00:00:00 2001 From: barkat-10 Date: Wed, 7 May 2025 16:24:19 -0500 Subject: [PATCH 2/3] Enricher component done --- crawler/LinkedinScraper.ts | 450 +++++++++++++++++++++++++++++++++++++ crawler/enricher.ts | 209 ++++++++++------- 2 files changed, 575 insertions(+), 84 deletions(-) create mode 100644 crawler/LinkedinScraper.ts diff --git a/crawler/LinkedinScraper.ts b/crawler/LinkedinScraper.ts new file mode 100644 index 0000000..ee7a333 --- /dev/null +++ b/crawler/LinkedinScraper.ts @@ -0,0 +1,450 @@ +import { WebDriver } from 'selenium-webdriver'; +import * as fs from 'fs'; +import * as path from 'path'; +import { load } from 'cheerio'; +import type { CheerioAPI } from 'cheerio'; +import { Client } from 'pg'; + +const pagesDir = path.join(process.cwd(), 'enricher_pages'); +const jsonDir = path.join(process.cwd(), 'enricher_json'); + +export class EnricherLinkedInScraper { + private driver: WebDriver; + private maxRetries: number; + private retryDelay: number; + + constructor(driver: WebDriver, maxRetries = 5, retryDelay = 2000) { + this.driver = driver; + this.maxRetries = maxRetries; + this.retryDelay = retryDelay; + + // Ensure directories exist + if (!fs.existsSync(pagesDir)) fs.mkdirSync(pagesDir, { recursive: true }); + if (!fs.existsSync(jsonDir)) fs.mkdirSync(jsonDir, { recursive: true }); + } + + private async delay(ms: number): Promise { + return new Promise(resolve => setTimeout(resolve, ms)); + } + + private async simulateHumanBehavior(): Promise { + await this.driver.executeScript(`window.scrollBy(0, ${Math.floor(Math.random() * 300) + 100})`); + await this.delay(500 + Math.random() * 1000); + } + + public async scrapeAndStoreProfile(profileUrl: string): Promise { + try { + console.log(`Attempting to scrape: ${profileUrl}`); + + // Navigate to profile + await this.driver.get('https://www.google.com'); + await this.delay(1500); + await this.driver.executeScript(`window.location.href="${profileUrl}"`); + + // Check if page loaded properly + await this.delay(3000); + const currentUrl = await this.driver.getCurrentUrl(); + if (!currentUrl.includes('linkedin.com/in/')) { + throw new Error('Failed to load LinkedIn profile'); + } + + // Save HTML + await this.simulateHumanBehavior(); + const html = await this.driver.getPageSource(); + const profileId = profileUrl.split('/in/')[1].split('/')[0].split('?')[0]; + const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + const filename = `${profileId}_${timestamp}.html`; + + fs.writeFileSync(path.join(pagesDir, filename), html); + console.log(`Saved HTML: ${filename}`); + + // Convert to JSON and insert to DB + await this.convertToJson2(filename, profileUrl); + return true; + } catch (error) { + console.error(`Error scraping ${profileUrl}:`, error); + return false; + } + } + private getBestText($: CheerioAPI, selectors: string[]): string | null { + for (const selector of selectors) { + const text = $(selector).first().text().trim(); + if (text) return text; + } + return null; + } + + + + // This is just another method to try converting + + private async convertToJson2(htmlFilename: string, profileUrl: string): Promise { + const htmlPath = path.join(pagesDir, htmlFilename); + const profileId = htmlFilename.split('_')[0]; + const jsonPath = path.join(jsonDir, `${profileId}.json`); + + try { + const html = fs.readFileSync(htmlPath, 'utf-8'); + const $ = load(html); + + // 🔹 Robust selector sets + const fullName = this.getBestText($, [ + 'h1.top-card-layout__title', + 'h1.text-heading-xlarge', + 'h1', + ]); + + const jobTitle = this.getBestText($, [ + 'h2.top-card-layout__headline', + 'h2.text-body-medium', + 'h2', + ]); + + const locationText = this.getBestText($, [ + '.top-card-layout__first-subline span:first-child', + '.top-card__subline-item', + '.top-card-location', + ]); + + let city = null, state = null; + if (locationText) { + const parts = locationText.split(',').map(p => p.trim()); + [city, state] = [parts[0], parts[1]]; + } + + // 🔹 Education Section + let highSchool = null, hsGraduationYear = null; + let university: string | null = null; +let universityGradYear: string | null = null; +let degree: string | null = null; + let nafAcademy = null, nafTrackCertified = null; + + const educationSection = $('section[data-section="educationsDetails"], section:contains("Education")'); + if (educationSection.length === 0) { + console.warn(`No education section for ${htmlFilename}`); + } + + educationSection.find('li.education__list-item, li').each((_, el) => { + const textBlock = $(el).text().toLowerCase(); + const school = $(el).find('h3').first().text().trim(); + const degreeText = $(el).find('h4').first().text().trim(); + const duration = $(el).find('span.date-range').first().text().trim(); + const yearMatch = duration.match(/(\d{4})/); + const gradYear = yearMatch?.[1] ?? null; + + if (textBlock.includes('high school')) { + highSchool = school; + hsGraduationYear = gradYear; + } else { + university = university || school; + universityGradYear = universityGradYear || gradYear; + degree = degree || degreeText; + } + + if (school.toLowerCase().includes('academy of finance')) { + nafAcademy = school; + } + if (degreeText.toLowerCase().includes('naf track')) { + nafTrackCertified = degreeText; + } + }); + + // 🔹 Experience Section + let currentJob = null; + let internship_company1: string | null = null; + let internship_end_date1 = null; + let internship_company2: string | null = null; + let internship_end_date2 = null; + + const expSection = $('section[data-section="experience"], section:contains("Experience")'); + if (expSection.length === 0) { + console.warn(`No experience section for ${htmlFilename}`); + } + + expSection.find('ul > li').each((i, el) => { + const title = $(el).find('h3 span, h3').first().text().trim(); + const company = $(el).find('h4 span, h4').first().text().trim(); + const duration = $(el).find('span.date-range').first().text().trim(); + const endDateMatch = duration.match(/(\w+\s+\d{4}|\d{4})$/); + const endDate = endDateMatch?.[0] ?? null; + + if (i === 0 && title && company) { + currentJob = `${title} at ${company}`; + } + + if (/intern(ship)?/i.test(title)) { + if (!internship_company1) { + internship_company1 = company; + internship_end_date1 = endDate; + } else if (!internship_company2) { + internship_company2 = company; + internship_end_date2 = endDate; + } + } + }); + + // 🔹 Build JSON + const dbData = { + profile_url: profileUrl, + timestamp: new Date().toISOString(), + full_name: fullName, + email: null, + phone_number: null, + high_school: highSchool, + hs_graduation_year: hsGraduationYear, + naf_academy: nafAcademy, + naf_track_certified: nafTrackCertified, + address: null, + city, + state, + zip_code: null, + birthdate: null, + gender: null, + ethnicity: null, + military_branch_served: null, + current_job: currentJob || jobTitle, + college_major: null, + university_grad_year: universityGradYear, + university, + degree, + linkedin_link: profileUrl, + school_district: null, + internship_company1, + internship_end_date1, + internship_company2, + internship_end_date2, + university2: null, + college_major2: null, + degree2: null + }; + + fs.writeFileSync(jsonPath, JSON.stringify(dbData, null, 2)); + console.log(`Saved JSON: ${jsonPath}`); + + await this.insertToEnricherDatabase(dbData); + } catch (error) { + console.error(`Error converting ${htmlFilename}:`, error); + throw error; + } + } + + + private async convertToJson(htmlFilename: string, profileUrl: string): Promise { + const htmlPath = path.join(pagesDir, htmlFilename); + const profileId = htmlFilename.split('_')[0]; + const jsonPath = path.join(jsonDir, `${profileId}.json`); + + try { + const html = fs.readFileSync(htmlPath, 'utf-8'); + const $ = load(html); + + // Extract basic profile information - UPDATED SELECTORS + const fullName = this.getText($, 'h1.top-card-layout__title') || + $('h1').text().trim(); + + const jobTitle = this.getText($, 'h2.top-card-layout__headline') || + $('h2').text().trim(); + + const locationText = this.getText($, '.top-card-layout__first-subline > span:first-child') || + $('.top-card-location').text().trim(); + + // Parse location into city/state + let city = null; + let state = null; + if (locationText) { + const locationParts = locationText.split(', '); + city = locationParts[0] || null; + state = locationParts[1] || null; + } + + // Extract education information - UPDATED TO USE SECTIONS + let highSchool = null; + let hsGraduationYear = null; + let university = null; + let universityGradYear = null; + let degree = null; + let nafAcademy = null; + let nafTrackCertified = null; + + const educationSection = $('section[data-section="educationsDetails"]'); + if (educationSection.length > 0) { + educationSection.find('ul > li.education__list-item').each((_, el) => { + const school = $(el).find('h3').first().text().trim(); + const degreeText = $(el).find('h4').first().text().trim(); + const duration = $(el).find('span.date-range').first().text().trim(); + + const yearMatch = duration.match(/(\d{4})/); + const gradYear = yearMatch ? yearMatch[1] : null; + + if (school.toLowerCase().includes('high school')) { + highSchool = school; + hsGraduationYear = gradYear; + } else { + university = school; + universityGradYear = gradYear; + degree = degreeText; + } + + if (school.toLowerCase().includes('academy of finance')) { + nafAcademy = school; + } + if (degreeText.toLowerCase().includes('naf track')) { + nafTrackCertified = degreeText; + } + }); + } + + // Extract experience information - UPDATED TO USE SECTIONS + let currentJob = null; + let internshipCompany1: string | null = null; + let internship_end_date1 = null; + let internship_company2: string | null = null; + let internship_end_date2 = null; + + const expSection = $('section[data-section="experience"]'); + if (expSection.length > 0) { + expSection.find('ul > li').each((i, el) => { + const title = $(el).find('h3 span.experience-item__title').first().text().trim(); + const company = $(el).find('h4 span.experience-item__subtitle').first().text().trim(); + const duration = $(el).find('span.date-range').first().text().trim(); + + const endDateMatch = duration.match(/(\w+\s+\d{4}|\d{4})$/); + const endDate = endDateMatch ? endDateMatch[0] : null; + + if (i === 0) { + currentJob = title ? `${title} at ${company}` : company; + } + + if (title?.toLowerCase().includes('intern') || + title?.toLowerCase().includes('internship')) { + if (!internshipCompany1) { + internshipCompany1 = company; + internship_end_date1 = endDate; + } else if (!internship_company2) { + internship_company2 = company; + internship_end_date2 = endDate; + } + } + }); + } + + // Prepare data for database insertion + const dbData = { + profile_url: profileUrl, + timestamp: new Date().toISOString(), + full_name: fullName, + email: null, + phone_number: null, + high_school: highSchool, + hs_graduation_year: hsGraduationYear, + naf_academy: nafAcademy, + naf_track_certified: nafTrackCertified, + address: null, + city: city, + state: state, + zip_code: null, + birthdate: null, + gender: null, + ethnicity: null, + military_branch_served: null, + current_job: currentJob || jobTitle, + college_major: null, + university_grad_year: universityGradYear, + university: university, + degree: degree, + linkedin_link: profileUrl, + school_district: null, + internship_company1: internshipCompany1, + internship_end_date1: internship_end_date1, + internship_company2: internship_company2, + internship_end_date2: internship_end_date2, + university2: null, + college_major2: null, + degree2: null + }; + + // Save JSON + fs.writeFileSync(jsonPath, JSON.stringify(dbData, null, 2)); + console.log(`Saved JSON: ${jsonPath}`); + + // Insert to database + await this.insertToEnricherDatabase(dbData); + } catch (error) { + console.error(`Error converting ${htmlFilename}:`, error); + throw error; + } + } + +// Add this helper function at class level +private getText($: CheerioAPI, selector: string): string | null { + const element = $(selector).first(); + return element.length ? element.text().trim() : null; +} + private async insertToEnricherDatabase(data: any): Promise { + const client = new Client({ + host: process.env.PGHOST, + port: parseInt(process.env.PGPORT || '5432'), + user: process.env.PGUSER, + password: process.env.PGPASSWORD, + database: process.env.PGDATABASE + }); + + try { + await client.connect(); + await client.query(` + INSERT INTO enricher_data ( + profile_url, timestamp, full_name, email, phone_number, + high_school, hs_graduation_year, naf_academy, naf_track_certified, + address, city, state, zip_code, birthdate, gender, ethnicity, + military_branch_served, current_job, college_major, university_grad_year, + university, degree, linkedin_link, school_district, + internship_company1, internship_end_date1, + internship_company2, internship_end_date2, + university2, college_major2, degree2 + ) VALUES ( + $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, + $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, + $29, $30, $31 + ) + `, [ + data.profile_url, + data.timestamp, + data.full_name, + data.email, + data.phone_number, + data.high_school, + data.hs_graduation_year, + data.naf_academy, + data.naf_track_certified, + data.address, + data.city, + data.state, + data.zip_code, + data.birthdate, + data.gender, + data.ethnicity, + data.military_branch_served, + data.current_job, + data.college_major, + data.university_grad_year, + data.university, + data.degree, + data.linkedin_link, + data.school_district, + data.internship_company1, + data.internship_end_date1, + data.internship_company2, + data.internship_end_date2, + data.university2, + data.college_major2, + data.degree2 + ]); + console.log('Data inserted into enricher_data'); + } catch (error) { + console.error('Error inserting into enricher_data:', error); + throw error; + } finally { + await client.end(); + } + } +} \ No newline at end of file diff --git a/crawler/enricher.ts b/crawler/enricher.ts index 90cf9f3..62747c7 100644 --- a/crawler/enricher.ts +++ b/crawler/enricher.ts @@ -3,6 +3,10 @@ import chrome, { ServiceBuilder } from 'selenium-webdriver/chrome'; //we are sup import chromedriver from 'chromedriver'; import { Client } from 'pg'; import 'dotenv/config'; // get the dot env file +import { EnricherLinkedInScraper } from './LinkedinScraper'; +import { ProfileData } from './html_json'; +import * as path from 'path'; +import * as fs from 'fs'; const client = new Client({ host: process.env.PGHOST, @@ -11,8 +15,8 @@ const client = new Client({ password: process.env.PGPASSWORD, database: process.env.PGDATABASE }); - - async function main(){ +// this is the function to try finding the person in cralwer database. + async function findInDb(){ try{ await client.connect(); @@ -74,7 +78,7 @@ const client = new Client({ data.internshipCompany1 // internship_company1 ]); - console.log(`Successfully enriched data for ${fullName}`); + console.log(`Successfully enriched data for ${fullName}`); } catch (err) { @@ -85,107 +89,144 @@ const client = new Client({ } } - main(); + findInDb(); + + -const serviceBuilder = new chrome.ServiceBuilder(chromedriver.path); -const Options = chrome.Options; //The Chrome-specific configuration options you can pass to customize how Chrome runs when used with Selenium WebDriver. -interface Person{ - fullName: String; + const serviceBuilder = new chrome.ServiceBuilder(chromedriver.path); + const Options = chrome.Options; //The Chrome-specific configuration options you can pass to customize how Chrome runs when used with Selenium WebDriver. + + interface Person { + fullName: string; email?: string; phone?: string; school?: string; -} + currentJob?: string; + city?: string; + highSchool?: string; + degree?: string; + } -interface SearchResult { + interface SearchResult { items?: { link: string }[]; error?: any; } -const person: Person = { - fullName: 'Shahreen Iqbal', - email: 'singhbarkat1011@gmail.com', - phone: '', - school: 'University Of Texas, Dallas', -} + /* */ -async function findLinkedinProfile(person: Person): Promise { - const { fullName, school } = person; - const [firstName, ...lastNameParts] = fullName.split(" "); - const lastName = lastNameParts.join(" "); + + async function findLinkedinProfile(person: Person): Promise { + const { fullName, school, currentJob, city, highSchool, degree } = person; + const apiKey = process.env.API_KEY; const searchEngineId = process.env.SEARCH_ENGINE_ID; - + if (!apiKey || !searchEngineId) { - console.error('API_KEY or SEARCH_ENGINE_ID not set in .env'); - process.exit(1); + console.error('API_KEY or SEARCH_ENGINE_ID not set in .env'); + process.exit(1); } + + let queryParts = [`site:linkedin.com/in "${fullName}"`]; + if (school) queryParts.push(school); + if (currentJob) queryParts.push(currentJob); + if (city) queryParts.push(city); + if (highSchool) queryParts.push(highSchool); + if (degree) queryParts.push(degree); - const queryVariations = [ - `site:linkedin.com/in "${fullName}"`, - `site:linkedin.com/in "${firstName} ${lastName}"`, - school ? `site:linkedin.com/in "${fullName}" "${school}"` : '', - school ? `site:linkedin.com/in "${firstName} ${lastName}" "${school}"` : '' - ].filter(Boolean); // Remove empty strings +const query = queryParts.join(" "); + + const url = `https://www.googleapis.com/customsearch/v1?key=${apiKey}&cx=${searchEngineId}&q=${encodeURIComponent(query)}&num=5`; + const chromeOptions = new Options(); - chromeOptions.addArguments( - '--headless=new', - '--disable-gpu', - '--no-sandbox', - '--disable-application-cache', - '--disable-extensions', - '--disable-notifications', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disk-cache-size=0', - '--media-cache-size=0', - '--aggressive-cache-discard' - ); - + chromeOptions.addArguments('--headless=new', '--disable-gpu', '--no-sandbox'); + const driver = await new Builder() - .forBrowser('chrome') - .setChromeOptions(chromeOptions) - .setChromeService(serviceBuilder) - .build(); - + .forBrowser('chrome') + .setChromeOptions(chromeOptions) + .setChromeService(serviceBuilder) + .build(); + try { - const allResults: any[] = []; - const seenUrls = new Set(); - - for (const query of queryVariations) { - console.log(`Searching with query: ${query}`); - const url = `https://www.googleapis.com/customsearch/v1?key=${apiKey}&cx=${searchEngineId}&q=${encodeURIComponent(query)}&num=5&${Date.now()}`; - - const data = await driver.executeAsyncScript(function (url: string, callback: Function) { - fetch(url) - .then(res => res.json()) - .then(result => callback(result)) - .catch(err => callback({ error: err.toString() })); - }, url) as SearchResult; - - const results = (data.items || []).filter((item: any) => - item.link.includes('linkedin.com/in') && !seenUrls.has(item.link) - ); - - for (const result of results) { - seenUrls.add(result.link); - allResults.push(result); - } - } - - if (allResults.length === 0) { - console.log('No LinkedIn profiles found.'); - } else { - console.log(`Top LinkedIn Matches for "${fullName}":`); - allResults.forEach((item, index) => { - console.log(`${index + 1}. ${item.link}`); - }); - } + const data = await driver.executeAsyncScript(function (url: string, callback: Function) { + fetch(url) + .then(res => res.json()) + .then(result => callback(result)) + .catch(err => callback({ error: err.toString() })); + }, url) as SearchResult; + + const results = (data.items || []).filter(item => + item.link.includes('linkedin.com/in') + ); + + if (results.length === 0) { + console.log(`No LinkedIn profiles found for "${fullName}".`); + return null; + } else { + console.log(`Top LinkedIn Matches for "${fullName}":`); + results.forEach((item, index) => { + console.log(`${index + 1}. ${item.link}`); + }); + const firstProfileUrl = results[0].link; + + return firstProfileUrl; + } } finally { - await driver.quit(); + await driver.quit(); } -} - -findLinkedinProfile(person).catch(console.error); \ No newline at end of file + + } + + async function findOnLinkedin() { + try { + + + // 👇 change this to the person you want to test + const fullName = 'Luke Edwards'; + + const res = await client.query( + `SELECT json FROM crawler_data WHERE json->>'fullName' = $1 LIMIT 1`, + [fullName] + ); + + if (res.rows.length === 0) { + console.log(`No data found for ${fullName}`); + return; + } + + const data = res.rows[0].json; + + const person: Person = { + fullName: data.fullName, + email: data.email, + phone: data.phoneNumber, + school: data.university || data.highSchool, + currentJob: data.currentJob, + city: data.city, + highSchool: data.highSchool, + degree: data.degree, + }; + + const profileUrl = await findLinkedinProfile(person); + if (!profileUrl) return; + // Initialize scraper and scrape the profile + const chromeOptions = new Options(); + chromeOptions.addArguments('--headless=new', '--disable-gpu', '--no-sandbox'); + const driver = await new Builder() + .forBrowser('chrome') + .setChromeOptions(chromeOptions) + .setChromeService(serviceBuilder) + .build(); + + const scraper = new EnricherLinkedInScraper(driver); + await scraper.scrapeAndStoreProfile(profileUrl); + } catch (err) { + console.error('Error:', err); + } finally { + await client.end(); + } + } + + findOnLinkedin().catch(console.error); \ No newline at end of file From 6977e7326a669388206d200dac7fbe56c4621b55 Mon Sep 17 00:00:00 2001 From: barkat-10 <141375614+barkat-10@users.noreply.github.com> Date: Mon, 12 May 2025 19:07:21 -0500 Subject: [PATCH 3/3] Updated LinkedinScraper.ts --- crawler/LinkedinScraper.ts | 451 ++++++++++++++++--------------------- 1 file changed, 194 insertions(+), 257 deletions(-) diff --git a/crawler/LinkedinScraper.ts b/crawler/LinkedinScraper.ts index ee7a333..44f89c2 100644 --- a/crawler/LinkedinScraper.ts +++ b/crawler/LinkedinScraper.ts @@ -4,6 +4,7 @@ import * as path from 'path'; import { load } from 'cheerio'; import type { CheerioAPI } from 'cheerio'; import { Client } from 'pg'; +import { Cheerio, Element } from 'cheerio'; const pagesDir = path.join(process.cwd(), 'enricher_pages'); const jsonDir = path.join(process.cwd(), 'enricher_json'); @@ -23,6 +24,8 @@ export class EnricherLinkedInScraper { if (!fs.existsSync(jsonDir)) fs.mkdirSync(jsonDir, { recursive: true }); } + +//add random delay private async delay(ms: number): Promise { return new Promise(resolve => setTimeout(resolve, ms)); } @@ -59,26 +62,16 @@ export class EnricherLinkedInScraper { console.log(`Saved HTML: ${filename}`); // Convert to JSON and insert to DB - await this.convertToJson2(filename, profileUrl); + await this.convertToJson(filename, profileUrl); return true; } catch (error) { console.error(`Error scraping ${profileUrl}:`, error); return false; } } - private getBestText($: CheerioAPI, selectors: string[]): string | null { - for (const selector of selectors) { - const text = $(selector).first().text().trim(); - if (text) return text; - } - return null; - } - - // This is just another method to try converting - - private async convertToJson2(htmlFilename: string, profileUrl: string): Promise { + private async convertToJson(htmlFilename: string, profileUrl: string): Promise { const htmlPath = path.join(pagesDir, htmlFilename); const profileId = htmlFilename.split('_')[0]; const jsonPath = path.join(jsonDir, `${profileId}.json`); @@ -86,262 +79,208 @@ export class EnricherLinkedInScraper { try { const html = fs.readFileSync(htmlPath, 'utf-8'); const $ = load(html); - - // 🔹 Robust selector sets - const fullName = this.getBestText($, [ - 'h1.top-card-layout__title', - 'h1.text-heading-xlarge', - 'h1', - ]); - - const jobTitle = this.getBestText($, [ - 'h2.top-card-layout__headline', - 'h2.text-body-medium', - 'h2', - ]); - - const locationText = this.getBestText($, [ - '.top-card-layout__first-subline span:first-child', - '.top-card__subline-item', - '.top-card-location', - ]); - - let city = null, state = null; - if (locationText) { - const parts = locationText.split(',').map(p => p.trim()); - [city, state] = [parts[0], parts[1]]; - } - - // 🔹 Education Section - let highSchool = null, hsGraduationYear = null; - let university: string | null = null; -let universityGradYear: string | null = null; -let degree: string | null = null; - let nafAcademy = null, nafTrackCertified = null; - - const educationSection = $('section[data-section="educationsDetails"], section:contains("Education")'); - if (educationSection.length === 0) { - console.warn(`No education section for ${htmlFilename}`); - } - - educationSection.find('li.education__list-item, li').each((_, el) => { - const textBlock = $(el).text().toLowerCase(); - const school = $(el).find('h3').first().text().trim(); - const degreeText = $(el).find('h4').first().text().trim(); - const duration = $(el).find('span.date-range').first().text().trim(); - const yearMatch = duration.match(/(\d{4})/); - const gradYear = yearMatch?.[1] ?? null; - - if (textBlock.includes('high school')) { - highSchool = school; - hsGraduationYear = gradYear; - } else { - university = university || school; - universityGradYear = universityGradYear || gradYear; - degree = degree || degreeText; - } - - if (school.toLowerCase().includes('academy of finance')) { - nafAcademy = school; - } - if (degreeText.toLowerCase().includes('naf track')) { - nafTrackCertified = degreeText; - } - }); - - // 🔹 Experience Section - let currentJob = null; - let internship_company1: string | null = null; - let internship_end_date1 = null; - let internship_company2: string | null = null; - let internship_end_date2 = null; - - const expSection = $('section[data-section="experience"], section:contains("Experience")'); - if (expSection.length === 0) { - console.warn(`No experience section for ${htmlFilename}`); - } - - expSection.find('ul > li').each((i, el) => { - const title = $(el).find('h3 span, h3').first().text().trim(); - const company = $(el).find('h4 span, h4').first().text().trim(); - const duration = $(el).find('span.date-range').first().text().trim(); - const endDateMatch = duration.match(/(\w+\s+\d{4}|\d{4})$/); - const endDate = endDateMatch?.[0] ?? null; - - if (i === 0 && title && company) { - currentJob = `${title} at ${company}`; - } - - if (/intern(ship)?/i.test(title)) { - if (!internship_company1) { - internship_company1 = company; - internship_end_date1 = endDate; - } else if (!internship_company2) { - internship_company2 = company; - internship_end_date2 = endDate; - } - } - }); - - // 🔹 Build JSON - const dbData = { - profile_url: profileUrl, - timestamp: new Date().toISOString(), - full_name: fullName, - email: null, - phone_number: null, - high_school: highSchool, - hs_graduation_year: hsGraduationYear, - naf_academy: nafAcademy, - naf_track_certified: nafTrackCertified, - address: null, - city, - state, - zip_code: null, - birthdate: null, - gender: null, - ethnicity: null, - military_branch_served: null, - current_job: currentJob || jobTitle, - college_major: null, - university_grad_year: universityGradYear, - university, - degree, - linkedin_link: profileUrl, - school_district: null, - internship_company1, - internship_end_date1, - internship_company2, - internship_end_date2, - university2: null, - college_major2: null, - degree2: null + + // 🔹 Name Extraction (with meta tag fallback) + const getMeta = ($: CheerioAPI, sel: string): string | null => { + return $(sel).attr('content')?.trim() ?? null; }; - - fs.writeFileSync(jsonPath, JSON.stringify(dbData, null, 2)); - console.log(`Saved JSON: ${jsonPath}`); - - await this.insertToEnricherDatabase(dbData); - } catch (error) { - console.error(`Error converting ${htmlFilename}:`, error); - throw error; - } - } - - private async convertToJson(htmlFilename: string, profileUrl: string): Promise { - const htmlPath = path.join(pagesDir, htmlFilename); - const profileId = htmlFilename.split('_')[0]; - const jsonPath = path.join(jsonDir, `${profileId}.json`); + const getText = ($: CheerioAPI, primarySelector: string): string | null => { + let text: string | null = null; + const primaryElement = $(primarySelector).first(); + if (primaryElement.length) { + text = primaryElement.text().trim(); + } + return text || null; + }; - try { - const html = fs.readFileSync(htmlPath, 'utf-8'); - const $ = load(html); + const getJsonLd = ($: CheerioAPI, rx: RegExp): string | null => { + const script = $('script[type="application/ld+json"]').html(); + if (!script) return null; + const m = script.match(rx); + return m?.[1]?.trim() ?? null; + }; + const cleanText = (text: string | null): string | null => { + if (!text) return null; + return text.replace(/[\n\s]+/g, ' ').trim(); + }; - // Extract basic profile information - UPDATED SELECTORS - const fullName = this.getText($, 'h1.top-card-layout__title') || - $('h1').text().trim(); - - const jobTitle = this.getText($, 'h2.top-card-layout__headline') || - $('h2').text().trim(); - - const locationText = this.getText($, '.top-card-layout__first-subline > span:first-child') || - $('.top-card-location').text().trim(); - - // Parse location into city/state - let city = null; - let state = null; - if (locationText) { - const locationParts = locationText.split(', '); - city = locationParts[0] || null; - state = locationParts[1] || null; + // 🔹 Name Extraction + let fullName: string | null = null; + const firstNameMeta = getMeta($, 'meta[property="profile:first_name"]'); + const lastNameMeta = getMeta($, 'meta[property="profile:last_name"]'); + if (firstNameMeta && lastNameMeta) { + fullName = `${firstNameMeta} ${lastNameMeta}`; + } else { + fullName = firstNameMeta || lastNameMeta || getText($, 'h1.top-card-layout__title'); } - // Extract education information - UPDATED TO USE SECTIONS - let highSchool = null; - let hsGraduationYear = null; - let university = null; - let universityGradYear = null; - let degree = null; - let nafAcademy = null; - let nafTrackCertified = null; + // Job Title and Location + const jobTitleOg = getMeta($, 'meta[property="og:title"]'); + const headlineH2 = getText($, 'h2.top-card-layout__headline'); + const jobTitle = jobTitleOg || headlineH2; + const locationVisible = getText($, '.top-card-layout__first-subline > span:first-child'); + const locationJsonLd = getJsonLd($, /"addressLocality":"(.*?)"/); + const location = locationVisible || locationJsonLd; + const city = locationJsonLd; + const state = null; // Not extracted in original, can be added if needed + const linkedinLink = getMeta($, 'meta[property="og:url"]'); + const email = $('a[href^="mailto:"]').attr('href')?.replace('mailto:', '').trim() ?? null; + const phoneNumber = getText($, 'span.phone'); + // Extract Education Data + let highSchool: string | null = null; + let hsGraduationYear: string | null = null; + let university: string | null = null; + let degree: string | null = null; + let universityGradYear: string | null = null; const educationSection = $('section[data-section="educationsDetails"]'); if (educationSection.length > 0) { - educationSection.find('ul > li.education__list-item').each((_, el) => { - const school = $(el).find('h3').first().text().trim(); - const degreeText = $(el).find('h4').first().text().trim(); - const duration = $(el).find('span.date-range').first().text().trim(); - - const yearMatch = duration.match(/(\d{4})/); - const gradYear = yearMatch ? yearMatch[1] : null; - - if (school.toLowerCase().includes('high school')) { - highSchool = school; - hsGraduationYear = gradYear; - } else { - university = school; - universityGradYear = gradYear; - degree = degreeText; - } + educationSection.find('ul > li.education__list-item').each((_, el) => { + const schoolNameElement = $(el).find('h3 a').first().length ? $(el).find('h3 a').first() : $(el).find('h3').first(); + const schoolName = cleanText(schoolNameElement.text()); + const degreeMajorElement = $(el).find('h4').first(); + const degreeMajorText = cleanText(degreeMajorElement.text()); + const dateRangeElement = $(el).find('span.date-range').first(); + const dateRangeText = cleanText(dateRangeElement.text()); - if (school.toLowerCase().includes('academy of finance')) { - nafAcademy = school; - } - if (degreeText.toLowerCase().includes('naf track')) { - nafTrackCertified = degreeText; - } - }); + + if (schoolName?.toLowerCase().includes('high school')) { + if (!highSchool) { + highSchool = schoolName; + if (dateRangeText) { + const yearMatch = dateRangeText.match(/(\d{4})\s*$/); + hsGraduationYear = yearMatch ? yearMatch[1] : null; + } + } + } else { + if (!university) { + university = schoolName; + if (degreeMajorText) { + const parts = degreeMajorText.split(',').map(p => p.trim()); + degree = parts[0]?.replace(/[\n\s]+/g, ' ').trim() || null; + } + if (dateRangeText) { + const yearMatch = dateRangeText.match(/(\d{4})\s*$/); + universityGradYear = yearMatch ? yearMatch[1] : null; + } + } + } + }); } + - // Extract experience information - UPDATED TO USE SECTIONS - let currentJob = null; - let internshipCompany1: string | null = null; - let internship_end_date1 = null; - let internship_company2: string | null = null; - let internship_end_date2 = null; + // NAF Involvement + let nafAcademy: string | null = null; + let nafTrackCertified: string | null = null; + const certSection = $('section[data-section="certifications"]'); + if (certSection.length > 0) { + certSection.find('ul > li').each((_, el) => { + const certName = cleanText($(el).find('h3').first().text()); + const issuerName = cleanText($(el).find('h4 a').first().text()); + + if (!nafTrackCertified && certName?.toLowerCase().includes('naftrack')) { + nafTrackCertified = certName; + } + if (!nafAcademy && certName?.toLowerCase().includes('academy of finance')) { + nafAcademy = certName; + } + if (issuerName?.toLowerCase() === 'naf') { + if (!nafTrackCertified) nafTrackCertified = certName ?? "NAF Issued Certification"; + if (!nafAcademy) nafAcademy = certName ?? "NAF Issued Certification"; + } + }); + } + const orgSection = $('section[data-section="organizations"]'); + if (!nafAcademy && orgSection.length > 0) { + orgSection.find('ul > li').each((_, el) => { + const orgName = cleanText($(el).find('h3').first().text()); + if (orgName?.toLowerCase().includes('academy of finance')) { + nafAcademy = orgName; + return false; + } + }); + } + if (!nafAcademy && educationSection.length > 0) { + educationSection.find('ul > li.education__list-item').each((_, el) => { + const description = cleanText($(el).find('div[data-section="educations"] p').first().text()); + if (description?.toLowerCase().includes('academy of finance')) { + nafAcademy = "Academy of Finance (Mentioned in Education)"; + return false; + } + }); + } + // Current Job + let currentJob: string | null = null; const expSection = $('section[data-section="experience"]'); if (expSection.length > 0) { - expSection.find('ul > li').each((i, el) => { - const title = $(el).find('h3 span.experience-item__title').first().text().trim(); - const company = $(el).find('h4 span.experience-item__subtitle').first().text().trim(); - const duration = $(el).find('span.date-range').first().text().trim(); - - const endDateMatch = duration.match(/(\w+\s+\d{4}|\d{4})$/); - const endDate = endDateMatch ? endDateMatch[0] : null; - - if (i === 0) { - currentJob = title ? `${title} at ${company}` : company; + const firstExperienceItem = expSection.find('ul.experience__list > li').first(); + const firstExpPosition = firstExperienceItem.hasClass('experience-group') + ? firstExperienceItem.find('ul.experience-group__positions > li').first() + : firstExperienceItem; + if (firstExpPosition.length > 0) { + const title = cleanText($(firstExpPosition).find('h3 span.experience-item__title').first().text()); + const company = cleanText($(firstExpPosition).find('h4 span.experience-item__subtitle').first().text()); + currentJob = title && company ? `${title} at ${company}` : title || company; } + } + currentJob = currentJob || headlineH2; - if (title?.toLowerCase().includes('intern') || - title?.toLowerCase().includes('internship')) { - if (!internshipCompany1) { - internshipCompany1 = company; - internship_end_date1 = endDate; - } else if (!internship_company2) { - internship_company2 = company; - internship_end_date2 = endDate; - } - } - }); + // Internship Data + let internship_company1: string | null = null; + let internship_end_date1: string | null = null; + let internship_company2: string | null = null; + let internship_end_date2: string | null = null; + if (expSection.length > 0) { + expSection.find('ul > li').each((_, el) => { + const title = $(el).find('h3 span.experience-item__title').first().text().trim() || null; + const companyElement = $(el).find('h4 span.experience-item__subtitle').first(); + const dateElement = $(el).find('span.date-range').first(); + + if (title?.toLowerCase().includes('intern') || title?.toLowerCase().includes('analyst')) { + const durationText = cleanText(dateElement.text()); + if (durationText && (durationText.includes('mos') || /^\d+\s+yr(s)?$/.test(durationText) || /^\d{1,2}\s+mo(s)?$/.test(durationText))) { + if (!internship_company1) { + internship_company1 = companyElement.text().trim() || null; + if (durationText) { + const endDateMatch = durationText.match(/–\s*(\w+\s+\d{4}|\d{4})/); + internship_end_date1 = endDateMatch ? endDateMatch[1] : null; + if (!internship_end_date1) { + const singleDateMatch = durationText.match(/(\w+\s+\d{4}|\d{4})/); + internship_end_date1 = singleDateMatch ? singleDateMatch[1] : null; + } + } + } else if (!internship_company2) { + internship_company2 = companyElement.text().trim() || null; + if (durationText) { + const endDateMatch = durationText.match(/–\s*(\w+\s+\d{4}|\d{4})/); + internship_end_date2 = endDateMatch ? endDateMatch[1] : null; + if (!internship_end_date2) { + const singleDateMatch = durationText.match(/(\w+\s+\d{4}|\d{4})/); + internship_end_date2 = singleDateMatch ? singleDateMatch[1] : null; + } + } + } + } + } + }); } - // Prepare data for database insertion + // 🔹 Build JSON (maintaining your exact structure) const dbData = { profile_url: profileUrl, timestamp: new Date().toISOString(), full_name: fullName, - email: null, - phone_number: null, + email, + phone_number: phoneNumber, high_school: highSchool, hs_graduation_year: hsGraduationYear, naf_academy: nafAcademy, naf_track_certified: nafTrackCertified, address: null, - city: city, - state: state, + city, + state, zip_code: null, birthdate: null, gender: null, @@ -350,36 +289,34 @@ let degree: string | null = null; current_job: currentJob || jobTitle, college_major: null, university_grad_year: universityGradYear, - university: university, - degree: degree, - linkedin_link: profileUrl, + university, + degree, + linkedin_link: linkedinLink, school_district: null, - internship_company1: internshipCompany1, - internship_end_date1: internship_end_date1, - internship_company2: internship_company2, - internship_end_date2: internship_end_date2, + internship_company1, + internship_end_date1, + internship_company2, + internship_end_date2, university2: null, college_major2: null, degree2: null }; - // Save JSON fs.writeFileSync(jsonPath, JSON.stringify(dbData, null, 2)); console.log(`Saved JSON: ${jsonPath}`); - // Insert to database await this.insertToEnricherDatabase(dbData); } catch (error) { console.error(`Error converting ${htmlFilename}:`, error); - throw error; + // Optionally: throw error; // Keep if you want failures to propagate } - } + } + + + + + -// Add this helper function at class level -private getText($: CheerioAPI, selector: string): string | null { - const element = $(selector).first(); - return element.length ? element.text().trim() : null; -} private async insertToEnricherDatabase(data: any): Promise { const client = new Client({ host: process.env.PGHOST, @@ -447,4 +384,4 @@ private getText($: CheerioAPI, selector: string): string | null { await client.end(); } } -} \ No newline at end of file +}