-
Notifications
You must be signed in to change notification settings - Fork 13
Preflight canonical url #2012
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Preflight canonical url #2012
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,6 +34,20 @@ import { isAuthUrl } from '../support/utils.js'; | |
| const auditType = Audit.AUDIT_TYPES.CANONICAL; | ||
| const { AUDIT_STEP_DESTINATIONS } = Audit; | ||
|
|
||
| /** | ||
| * Check if canonical is self-referenced (ignoring protocol and case) | ||
| * | ||
| */ | ||
| export function normalizeUrlExport(url) { | ||
| try { | ||
| const urlObj = new URL(url); | ||
| // Remove protocol, domain, query params, and hash; lowercase; keep only pathname | ||
| return urlObj.pathname.toLowerCase(); | ||
| } catch { | ||
| return url.toLowerCase(); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Step 1: Import top pages (used in multi-step audit) | ||
| */ | ||
|
|
@@ -325,6 +339,161 @@ export async function validateCanonicalRecursively( | |
| return checks; | ||
| } | ||
|
|
||
| /* | ||
| * check if a scraped object have valid canonical url tag | ||
| */ | ||
| export async function validateCanonicalTag(scrapedObject, context, key = '') { | ||
| const { | ||
| site, log, | ||
| } = context; | ||
| const baseURL = site.getBaseURL(); | ||
|
|
||
| try { | ||
| if (!scrapedObject?.scrapeResult?.canonical) { | ||
| log.warn(`[canonical] No canonical metadata in S3 object: ${key}`); | ||
| return { url: null, canonicalUrl: null, checks: [] }; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If there is no canonical scrape result here, shouldn't this be flagged as a something like CANONICAL_CHECKS.CANONICAL_TAG_MISSING and not silently returned? |
||
| } | ||
|
|
||
| const url = scrapedObject.url || scrapedObject.finalUrl; | ||
| if (!url) { | ||
| log.warn(`[canonical] No URL found in S3 object: ${key}`); | ||
| return { url: null, canonicalUrl: null, checks: [] }; | ||
| } | ||
|
|
||
| const finalUrl = scrapedObject.finalUrl || url; | ||
|
Comment on lines
+357
to
+363
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If scrapedObject.url exists we set it to url, but then on line 363 we flip it and say if scrapedObject.finalUrl is available we us it over the line on 357. This is a bit confusing. The logic has a subtle redundancy and could be misleading: If scrapedObject.url is missing but scrapedObject.finalUrl exists, then url and finalUrl will be the same value (both set to scrapedObject.finalUrl). The finalUrl fallback on line 363 becomes a no-op, and more importantly, the semantic distinction between "the URL we requested" and "the URL we landed on" is lost. This preserves the semantic distinction: url is always the original requested URL, and finalUrl is the resolved destination (defaulting to url when no redirect occurred). If scrapedObject.url is genuinely absent, it's a real data problem worth rejecting cleanly — not silently masking by substituting finalUrl. |
||
|
|
||
| // Filter out scraped pages that redirected to auth/login pages or PDFs | ||
| // This prevents false positives when a legitimate page redirects to login | ||
| if (isAuthUrl(finalUrl)) { | ||
| log.info(`[canonical] Skipping ${url} - redirected to auth page: ${finalUrl}`); | ||
| return { url: null, canonicalUrl: null, checks: [] }; | ||
| } | ||
| if (isPdfUrl(finalUrl)) { | ||
| log.info(`[canonical] Skipping ${url} - redirected to PDF: ${finalUrl}`); | ||
| return { url: null, canonicalUrl: null, checks: [] }; | ||
| } | ||
|
|
||
| const isPreview = isPreviewPage(baseURL); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Checking isPreview here and using it way down in the code, it would be cleaner if we move this check closer to where isPreview is used. |
||
|
|
||
| // Use canonical metadata already extracted by the scraper (Puppeteer) | ||
| const canonicalMetadata = scrapedObject.scrapeResult.canonical; | ||
| const canonicalUrl = canonicalMetadata.href || null; | ||
| const canonicalTagChecks = []; | ||
|
|
||
| // Check if canonical tag exists | ||
| if (!canonicalMetadata.exists || !canonicalUrl) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this might need to be broken down into two checks as we might have metadata, but the url is missing and we are returning a response saying the tag is missing when it isn't. This requires a dedicated CANONICAL_TAG_NO_HREF check constant, but it surfaces a genuinely distinct failure mode rather than silently mislabeling a malformed tag as a missing one. |
||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.check, | ||
| success: false, | ||
| explanation: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.explanation, | ||
| }); | ||
| } else { | ||
| // Canonical tag exists | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.check, | ||
| success: true, | ||
| }); | ||
|
|
||
| // Check if canonical is in <head> | ||
| if (!canonicalMetadata.inHead) { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_OUTSIDE_HEAD.check, | ||
| success: false, | ||
| explanation: CANONICAL_CHECKS.CANONICAL_TAG_OUTSIDE_HEAD.explanation, | ||
| }); | ||
| } else { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_OUTSIDE_HEAD.check, | ||
| success: true, | ||
| }); | ||
| } | ||
|
|
||
| // Check if there are multiple canonical tags | ||
| if (canonicalMetadata.count > 1) { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_MULTIPLE.check, | ||
| success: false, | ||
| explanation: CANONICAL_CHECKS.CANONICAL_TAG_MULTIPLE.explanation, | ||
| }); | ||
| } else { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_MULTIPLE.check, | ||
| success: true, | ||
| }); | ||
| } | ||
|
|
||
| // Check if canonical is nonempty | ||
| if (!canonicalUrl || canonicalUrl.trim() === '') { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use |
||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_EMPTY.check, | ||
| success: false, | ||
| explanation: CANONICAL_CHECKS.CANONICAL_TAG_EMPTY.explanation, | ||
| }); | ||
| } else { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_EMPTY.check, | ||
| success: true, | ||
| }); | ||
| } | ||
|
|
||
| const normalizedCanonical = normalizeUrlExport(canonicalUrl); | ||
| const normalizedFinal = normalizeUrlExport(finalUrl); | ||
|
Comment on lines
+439
to
+440
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would remove the normalizeUrlExport function and create a new function called isSelfReferencing that takes the two URLs and validates if they are the same url. This would be a cleaner approach isolating the logic of what validates two urls are the same. |
||
|
|
||
| // Canonical should match the final URL (what was actually served) | ||
| const isSelfReferenced = normalizedCanonical === normalizedFinal; | ||
| if (isSelfReferenced) { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check, | ||
| success: true, | ||
| }); | ||
| } else { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check, | ||
| success: false, | ||
| explanation: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.explanation, | ||
| }); | ||
| } | ||
| } | ||
|
|
||
| const checks = [...canonicalTagChecks]; | ||
|
|
||
| if (canonicalUrl) { | ||
| const urlFormatChecks = validateCanonicalFormat(canonicalUrl, baseURL, log, isPreview); | ||
| checks.push(...urlFormatChecks); | ||
|
|
||
| // self-reference check | ||
| const selfRefCheck = canonicalTagChecks.find( | ||
| (c) => c.check === CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check, | ||
| ); | ||
| const isSelfReferenced = selfRefCheck?.success === true; | ||
|
|
||
| // if self-referenced - skip accessibility | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. reading this comment had me think that we are skipping an accessibility audit, not the fact that the url was a 200. |
||
| if (isSelfReferenced) { | ||
| checks.push({ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ideally we have a set of checks that pass when the url is valid, this allows for future conditions to be added with easy and maintainability: And then we could: |
||
| check: CANONICAL_CHECKS.CANONICAL_URL_STATUS_OK.check, | ||
| success: true, | ||
| }); | ||
| checks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.check, | ||
| success: true, | ||
| }); | ||
| } else { | ||
| // if not self-referenced - validate accessibility | ||
|
|
||
| const options = await getPreviewAuthOptions(isPreview, baseURL, site, context, log); | ||
|
|
||
| const urlContentCheck = await validateCanonicalRecursively(canonicalUrl, log, options); | ||
| checks.push(...urlContentCheck); | ||
| } | ||
| } | ||
|
|
||
| return { url, canonicalUrl, checks }; | ||
|
Comment on lines
+481
to
+490
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Formatting is required. |
||
| } catch (error) { | ||
| log.error(`[canonical] Error processing scraped content from ${key}: ${error.message}`); | ||
| return { url: null, canonicalUrl: null, checks: [] }; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Define a default object here that can be used throughout the code base that has these values and return it. |
||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Generates a suggestion for fixing a canonical issue based on the check type. | ||
| * | ||
|
|
@@ -391,155 +560,11 @@ export async function processScrapedContent(context) { | |
| return null; | ||
| } | ||
|
|
||
| if (!scrapedObject?.scrapeResult?.canonical) { | ||
| log.warn(`[canonical] No canonical metadata in S3 object: ${key}`); | ||
| return null; | ||
| } | ||
|
|
||
| const url = scrapedObject.url || scrapedObject.finalUrl; | ||
| if (!url) { | ||
| log.warn(`[canonical] No URL found in S3 object: ${key}`); | ||
| const result = await validateCanonicalTag(scrapedObject, context, key); | ||
| if (!result.url) { | ||
| return null; | ||
| } | ||
|
|
||
| const finalUrl = scrapedObject.finalUrl || url; | ||
|
|
||
| // Filter out scraped pages that redirected to auth/login pages or PDFs | ||
| // This prevents false positives when a legitimate page redirects to login | ||
| if (isAuthUrl(finalUrl)) { | ||
| log.info(`[canonical] Skipping ${url} - redirected to auth page: ${finalUrl}`); | ||
| return null; | ||
| } | ||
| if (isPdfUrl(finalUrl)) { | ||
| log.info(`[canonical] Skipping ${url} - redirected to PDF: ${finalUrl}`); | ||
| return null; | ||
| } | ||
|
|
||
| const isPreview = isPreviewPage(baseURL); | ||
|
|
||
| // Use canonical metadata already extracted by the scraper (Puppeteer) | ||
| const canonicalMetadata = scrapedObject.scrapeResult.canonical; | ||
| const canonicalUrl = canonicalMetadata.href || null; | ||
| const canonicalTagChecks = []; | ||
|
|
||
| // Check if canonical tag exists | ||
| if (!canonicalMetadata.exists || !canonicalUrl) { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.check, | ||
| success: false, | ||
| explanation: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.explanation, | ||
| }); | ||
| } else { | ||
| // Canonical tag exists | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.check, | ||
| success: true, | ||
| }); | ||
|
|
||
| // Check if canonical is in <head> | ||
| if (!canonicalMetadata.inHead) { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_OUTSIDE_HEAD.check, | ||
| success: false, | ||
| explanation: CANONICAL_CHECKS.CANONICAL_TAG_OUTSIDE_HEAD.explanation, | ||
| }); | ||
| } else { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_OUTSIDE_HEAD.check, | ||
| success: true, | ||
| }); | ||
| } | ||
|
|
||
| // Check if there are multiple canonical tags | ||
| if (canonicalMetadata.count > 1) { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_MULTIPLE.check, | ||
| success: false, | ||
| explanation: CANONICAL_CHECKS.CANONICAL_TAG_MULTIPLE.explanation, | ||
| }); | ||
| } else { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_MULTIPLE.check, | ||
| success: true, | ||
| }); | ||
| } | ||
|
|
||
| // Check if canonical is nonempty | ||
| if (!canonicalUrl || canonicalUrl.trim() === '') { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_EMPTY.check, | ||
| success: false, | ||
| explanation: CANONICAL_CHECKS.CANONICAL_TAG_EMPTY.explanation, | ||
| }); | ||
| } else { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_TAG_EMPTY.check, | ||
| success: true, | ||
| }); | ||
| } | ||
|
|
||
| // Check if canonical is self-referenced (ignoring protocol, domain, query, hash, case) | ||
| const normalizeUrl = (u) => { | ||
| try { | ||
| const urlObj = new URL(u); | ||
| // Remove protocol, domain, query params, and hash; lowercase; keep only pathname | ||
| return urlObj.pathname.toLowerCase(); | ||
| } catch { | ||
| return u.toLowerCase(); | ||
| } | ||
| }; | ||
| const normalizedCanonical = normalizeUrl(canonicalUrl); | ||
| const normalizedFinal = normalizeUrl(finalUrl); | ||
|
|
||
| // Canonical should match the final URL path (what was actually served) | ||
| const isSelfReferenced = normalizedCanonical === normalizedFinal; | ||
| if (isSelfReferenced) { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check, | ||
| success: true, | ||
| }); | ||
| } else { | ||
| canonicalTagChecks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check, | ||
| success: false, | ||
| explanation: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.explanation, | ||
| }); | ||
| } | ||
| } | ||
|
|
||
| const checks = [...canonicalTagChecks]; | ||
|
|
||
| if (canonicalUrl) { | ||
| const urlFormatChecks = validateCanonicalFormat(canonicalUrl, baseURL, log, isPreview); | ||
| checks.push(...urlFormatChecks); | ||
|
|
||
| // self-reference check | ||
| const selfRefCheck = canonicalTagChecks.find( | ||
| (c) => c.check === CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check, | ||
| ); | ||
| const isSelfReferenced = selfRefCheck?.success === true; | ||
|
|
||
| // if self-referenced - skip accessibility | ||
| if (isSelfReferenced) { | ||
| checks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_URL_STATUS_OK.check, | ||
| success: true, | ||
| }); | ||
| checks.push({ | ||
| check: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.check, | ||
| success: true, | ||
| }); | ||
| } else { | ||
| // if not self-referenced - validate accessibility | ||
|
|
||
| const options = await getPreviewAuthOptions(isPreview, baseURL, site, context, log); | ||
|
|
||
| const urlContentCheck = await validateCanonicalRecursively(canonicalUrl, log, options); | ||
| checks.push(...urlContentCheck); | ||
| } | ||
| } | ||
|
|
||
| return { url, checks }; | ||
| return { url: result.url, checks: result.checks }; | ||
| } catch (error) { | ||
| log.error(`[canonical] Error processing scraped content from ${key}: ${error.message}`); | ||
| return null; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.