Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
319 changes: 172 additions & 147 deletions src/canonical/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,20 @@ import { isAuthUrl } from '../support/utils.js';
const auditType = Audit.AUDIT_TYPES.CANONICAL;
const { AUDIT_STEP_DESTINATIONS } = Audit;

/**
* Check if canonical is self-referenced (ignoring protocol and case)
*
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. This function does not check anything, this is a get function that returns the pathname lowercased. Please update the documentation accordingly.
  2. remove unnecessary whitespace.
  3. I suggest removing this function entirely (see comment below about validation)

*/
export function normalizeUrlExport(url) {
try {
const urlObj = new URL(url);
// Remove protocol, domain, query params, and hash; lowercase; keep only pathname
return urlObj.pathname.toLowerCase();
} catch {
return url.toLowerCase();
}
}

/**
* Step 1: Import top pages (used in multi-step audit)
*/
Expand Down Expand Up @@ -325,6 +339,161 @@ export async function validateCanonicalRecursively(
return checks;
}

/*
* check if a scraped object have valid canonical url tag
*/
export async function validateCanonicalTag(scrapedObject, context, key = '') {
const {
site, log,
} = context;
const baseURL = site.getBaseURL();

try {
if (!scrapedObject?.scrapeResult?.canonical) {
log.warn(`[canonical] No canonical metadata in S3 object: ${key}`);
return { url: null, canonicalUrl: null, checks: [] };
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there is no canonical scrape result here, shouldn't this be flagged as a something like CANONICAL_CHECKS.CANONICAL_TAG_MISSING and not silently returned?

}

const url = scrapedObject.url || scrapedObject.finalUrl;
if (!url) {
log.warn(`[canonical] No URL found in S3 object: ${key}`);
return { url: null, canonicalUrl: null, checks: [] };
}

const finalUrl = scrapedObject.finalUrl || url;
Comment on lines +357 to +363
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If scrapedObject.url exists we set it to url, but then on line 363 we flip it and say if scrapedObject.finalUrl is available we us it over the line on 357. This is a bit confusing.

The logic has a subtle redundancy and could be misleading:
Line 357: const url = scrapedObject.url || scrapedObject.finalUrl
Line 363: const finalUrl = scrapedObject.finalUrl || url

If scrapedObject.url is missing but scrapedObject.finalUrl exists, then url and finalUrl will be the same value (both set to scrapedObject.finalUrl). The finalUrl fallback on line 363 becomes a no-op, and more importantly, the semantic distinction between "the URL we requested" and "the URL we landed on" is lost.

const url = scrapedObject.url;
if (!url) {
  log.warn(`[canonical] No URL found in S3 object: ${key}`);
  return { url: null, canonicalUrl: null, checks: [] };
}

const finalUrl = scrapedObject.finalUrl || url;

This preserves the semantic distinction: url is always the original requested URL, and finalUrl is the resolved destination (defaulting to url when no redirect occurred). If scrapedObject.url is genuinely absent, it's a real data problem worth rejecting cleanly — not silently masking by substituting finalUrl.


// Filter out scraped pages that redirected to auth/login pages or PDFs
// This prevents false positives when a legitimate page redirects to login
if (isAuthUrl(finalUrl)) {
log.info(`[canonical] Skipping ${url} - redirected to auth page: ${finalUrl}`);
return { url: null, canonicalUrl: null, checks: [] };
}
if (isPdfUrl(finalUrl)) {
log.info(`[canonical] Skipping ${url} - redirected to PDF: ${finalUrl}`);
return { url: null, canonicalUrl: null, checks: [] };
}

const isPreview = isPreviewPage(baseURL);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checking isPreview here and using it way down in the code, it would be cleaner if we move this check closer to where isPreview is used.


// Use canonical metadata already extracted by the scraper (Puppeteer)
const canonicalMetadata = scrapedObject.scrapeResult.canonical;
const canonicalUrl = canonicalMetadata.href || null;
const canonicalTagChecks = [];

// Check if canonical tag exists
if (!canonicalMetadata.exists || !canonicalUrl) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this might need to be broken down into two checks as we might have metadata, but the url is missing and we are returning a response saying the tag is missing when it isn't.

if (!canonicalMetadata.exists) {
  canonicalTagChecks.push({
    check: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.check,
    success: false,
    explanation: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.explanation,
  });
} else if (!canonicalUrl) {
  // Tag exists but href is absent or empty
  canonicalTagChecks.push({
    check: CANONICAL_CHECKS.CANONICAL_TAG_NO_HREF.check,
    success: false,
    explanation: CANONICAL_CHECKS.CANONICAL_TAG_NO_HREF.explanation,
  });
}

This requires a dedicated CANONICAL_TAG_NO_HREF check constant, but it surfaces a genuinely distinct failure mode rather than silently mislabeling a malformed tag as a missing one.

canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.check,
success: false,
explanation: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.explanation,
});
} else {
// Canonical tag exists
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.check,
success: true,
});

// Check if canonical is in <head>
if (!canonicalMetadata.inHead) {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_OUTSIDE_HEAD.check,
success: false,
explanation: CANONICAL_CHECKS.CANONICAL_TAG_OUTSIDE_HEAD.explanation,
});
} else {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_OUTSIDE_HEAD.check,
success: true,
});
}

// Check if there are multiple canonical tags
if (canonicalMetadata.count > 1) {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_MULTIPLE.check,
success: false,
explanation: CANONICAL_CHECKS.CANONICAL_TAG_MULTIPLE.explanation,
});
} else {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_MULTIPLE.check,
success: true,
});
}

// Check if canonical is nonempty
if (!canonicalUrl || canonicalUrl.trim() === '') {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use hasText(canonicalUrl) from spacecat-shared-utils.

canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_EMPTY.check,
success: false,
explanation: CANONICAL_CHECKS.CANONICAL_TAG_EMPTY.explanation,
});
} else {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_EMPTY.check,
success: true,
});
}

const normalizedCanonical = normalizeUrlExport(canonicalUrl);
const normalizedFinal = normalizeUrlExport(finalUrl);
Comment on lines +439 to +440
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would remove the normalizeUrlExport function and create a new function called isSelfReferencing that takes the two URLs and validates if they are the same url. This would be a cleaner approach isolating the logic of what validates two urls are the same.


// Canonical should match the final URL (what was actually served)
const isSelfReferenced = normalizedCanonical === normalizedFinal;
if (isSelfReferenced) {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check,
success: true,
});
} else {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check,
success: false,
explanation: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.explanation,
});
}
}

const checks = [...canonicalTagChecks];

if (canonicalUrl) {
const urlFormatChecks = validateCanonicalFormat(canonicalUrl, baseURL, log, isPreview);
checks.push(...urlFormatChecks);

// self-reference check
const selfRefCheck = canonicalTagChecks.find(
(c) => c.check === CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check,
);
const isSelfReferenced = selfRefCheck?.success === true;

// if self-referenced - skip accessibility
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reading this comment had me think that we are skipping an accessibility audit, not the fact that the url was a 200.

if (isSelfReferenced) {
checks.push({
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally we have a set of checks that pass when the url is valid, this allows for future conditions to be added with easy and maintainability:

const ACCESSIBILITY_CHECKS = [
  CANONICAL_CHECKS.CANONICAL_URL_STATUS_OK.check,
  CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.check,
];

And then we could:

if (isSelfReferenced) {
  checks.push(...ACCESSIBILITY_CHECKS.map((check) => ({ check, success: true })));
} else {
  ...
}

check: CANONICAL_CHECKS.CANONICAL_URL_STATUS_OK.check,
success: true,
});
checks.push({
check: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.check,
success: true,
});
} else {
// if not self-referenced - validate accessibility

const options = await getPreviewAuthOptions(isPreview, baseURL, site, context, log);

const urlContentCheck = await validateCanonicalRecursively(canonicalUrl, log, options);
checks.push(...urlContentCheck);
}
}

return { url, canonicalUrl, checks };
Comment on lines +481 to +490
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Formatting is required.

} catch (error) {
log.error(`[canonical] Error processing scraped content from ${key}: ${error.message}`);
return { url: null, canonicalUrl: null, checks: [] };
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Define a default object here that can be used throughout the code base that has these values and return it.

}
}

/**
* Generates a suggestion for fixing a canonical issue based on the check type.
*
Expand Down Expand Up @@ -391,155 +560,11 @@ export async function processScrapedContent(context) {
return null;
}

if (!scrapedObject?.scrapeResult?.canonical) {
log.warn(`[canonical] No canonical metadata in S3 object: ${key}`);
return null;
}

const url = scrapedObject.url || scrapedObject.finalUrl;
if (!url) {
log.warn(`[canonical] No URL found in S3 object: ${key}`);
const result = await validateCanonicalTag(scrapedObject, context, key);
if (!result.url) {
return null;
}

const finalUrl = scrapedObject.finalUrl || url;

// Filter out scraped pages that redirected to auth/login pages or PDFs
// This prevents false positives when a legitimate page redirects to login
if (isAuthUrl(finalUrl)) {
log.info(`[canonical] Skipping ${url} - redirected to auth page: ${finalUrl}`);
return null;
}
if (isPdfUrl(finalUrl)) {
log.info(`[canonical] Skipping ${url} - redirected to PDF: ${finalUrl}`);
return null;
}

const isPreview = isPreviewPage(baseURL);

// Use canonical metadata already extracted by the scraper (Puppeteer)
const canonicalMetadata = scrapedObject.scrapeResult.canonical;
const canonicalUrl = canonicalMetadata.href || null;
const canonicalTagChecks = [];

// Check if canonical tag exists
if (!canonicalMetadata.exists || !canonicalUrl) {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.check,
success: false,
explanation: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.explanation,
});
} else {
// Canonical tag exists
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.check,
success: true,
});

// Check if canonical is in <head>
if (!canonicalMetadata.inHead) {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_OUTSIDE_HEAD.check,
success: false,
explanation: CANONICAL_CHECKS.CANONICAL_TAG_OUTSIDE_HEAD.explanation,
});
} else {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_OUTSIDE_HEAD.check,
success: true,
});
}

// Check if there are multiple canonical tags
if (canonicalMetadata.count > 1) {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_MULTIPLE.check,
success: false,
explanation: CANONICAL_CHECKS.CANONICAL_TAG_MULTIPLE.explanation,
});
} else {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_MULTIPLE.check,
success: true,
});
}

// Check if canonical is nonempty
if (!canonicalUrl || canonicalUrl.trim() === '') {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_EMPTY.check,
success: false,
explanation: CANONICAL_CHECKS.CANONICAL_TAG_EMPTY.explanation,
});
} else {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_TAG_EMPTY.check,
success: true,
});
}

// Check if canonical is self-referenced (ignoring protocol, domain, query, hash, case)
const normalizeUrl = (u) => {
try {
const urlObj = new URL(u);
// Remove protocol, domain, query params, and hash; lowercase; keep only pathname
return urlObj.pathname.toLowerCase();
} catch {
return u.toLowerCase();
}
};
const normalizedCanonical = normalizeUrl(canonicalUrl);
const normalizedFinal = normalizeUrl(finalUrl);

// Canonical should match the final URL path (what was actually served)
const isSelfReferenced = normalizedCanonical === normalizedFinal;
if (isSelfReferenced) {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check,
success: true,
});
} else {
canonicalTagChecks.push({
check: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check,
success: false,
explanation: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.explanation,
});
}
}

const checks = [...canonicalTagChecks];

if (canonicalUrl) {
const urlFormatChecks = validateCanonicalFormat(canonicalUrl, baseURL, log, isPreview);
checks.push(...urlFormatChecks);

// self-reference check
const selfRefCheck = canonicalTagChecks.find(
(c) => c.check === CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check,
);
const isSelfReferenced = selfRefCheck?.success === true;

// if self-referenced - skip accessibility
if (isSelfReferenced) {
checks.push({
check: CANONICAL_CHECKS.CANONICAL_URL_STATUS_OK.check,
success: true,
});
checks.push({
check: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.check,
success: true,
});
} else {
// if not self-referenced - validate accessibility

const options = await getPreviewAuthOptions(isPreview, baseURL, site, context, log);

const urlContentCheck = await validateCanonicalRecursively(canonicalUrl, log, options);
checks.push(...urlContentCheck);
}
}

return { url, checks };
return { url: result.url, checks: result.checks };
} catch (error) {
log.error(`[canonical] Error processing scraped content from ${key}: ${error.message}`);
return null;
Expand Down
Loading