Improve URL checks to reduce false-negatives

This commit improves the URL health checking mechanism to reduce false
negatives.

- Treat all 2XX status codes as successful, addressing issues with codes
  like `204`.
- Improve URL matching to exclude URLs within Markdown inline code blocks
  and to support URLs containing parentheses.
- Add `forceHttpGetForUrlPatterns` to customize the HTTP method per URL,
  allowing verification of URLs behind CDNs/WAFs that do not respond to
  HTTP HEAD (see the request-options sketch after this list).
- Send the `Host` header to improve handling of webpages behind proxies
  (covered in the same sketch).
- Improve formatting and context for output messages.
- Fix the default options for redirect and cookie handling (see the
  defaulting sketch after this list).
- Update the user agent pool to modern browsers and platforms (a sample
  pool is sketched after this list).
- Add support for randomizing the TLS fingerprint to better mimic various
  clients, improving the effectiveness of checks (see the cipher-shuffling
  sketch after this list). However, this is not fully supported by Node.js's
  HTTP client; see nodejs/undici#1983 for more details.
- Use `AbortSignal` instead of `AbortController` as a more modern and
  simpler way to handle timeouts (see the before/after sketch after this
  list).
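
A small sketch of the request-level changes (`forceHttpGetForUrlPatterns` and the explicit `Host` header). The option name comes from this commit, but its exact shape and the helper functions below are illustrative assumptions, not the commit's actual code:

```ts
// Sketch only: option shape and helper names are hypothetical.
interface UrlCheckRequestOptions {
  readonly forceHttpGetForUrlPatterns?: readonly RegExp[];
  readonly additionalHeaders?: Record<string, string>;
}

// Prefer cheap HEAD probes, but fall back to GET for hosts that ignore HEAD.
function selectHttpMethod(url: string, options: UrlCheckRequestOptions): 'HEAD' | 'GET' {
  const patterns = options.forceHttpGetForUrlPatterns ?? [];
  return patterns.some((pattern) => pattern.test(url)) ? 'GET' : 'HEAD';
}

// Send an explicit Host header so proxied/CDN-fronted pages resolve correctly.
function buildRequestHeaders(url: string, options: UrlCheckRequestOptions): Record<string, string> {
  return {
    Host: new URL(url).host,
    ...options.additionalHeaders,
  };
}

// Example: force GET for a (hypothetical) CDN-fronted host that drops HEAD requests.
const method = selectHttpMethod('https://cdn.example.com/page', {
  forceHttpGetForUrlPatterns: [/^https:\/\/cdn\.example\.com\//],
}); // 'GET'
```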
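
The defaulting fix can be sketched with nullish coalescing, assuming redirects and cookies default to enabled; the helper name is illustrative:

```ts
interface FollowOptions {
  readonly followRedirects?: boolean;
  readonly enableCookies?: boolean;
}

// Nullish coalescing keeps an explicit `false`, while `undefined` falls back to the default.
function withFollowDefaults(options?: FollowOptions): Required<FollowOptions> {
  return {
    followRedirects: options?.followRedirects ?? true,
    enableCookies: options?.enableCookies ?? true,
  };
}
```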
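
A user agent pool of the kind described above might look like the following; the exact strings shipped in the commit may differ:

```ts
// Illustrative pool of modern desktop browser user agents (as of early 2024).
const USER_AGENTS: readonly string[] = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15',
  'Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0',
];

function getRandomUserAgent(): string {
  return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
}
```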
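
One plausible way to randomize the TLS fingerprint is to shuffle the cipher order offered in the ClientHello, as sketched below. This is an assumption about the technique rather than the commit's exact implementation, and as the referenced undici issue notes, Node.js's built-in `fetch` does not expose these TLS options directly:

```ts
import { Agent } from 'node:https';
import { constants } from 'node:crypto';

// Shuffle Node's default cipher list (Fisher-Yates) so each agent offers a
// slightly different ClientHello. Only approximates real browser fingerprints.
function createRandomizedTlsAgent(): Agent {
  const ciphers = constants.defaultCoreCipherList.split(':');
  for (let i = ciphers.length - 1; i > 0; i -= 1) {
    const j = Math.floor(Math.random() * (i + 1));
    [ciphers[i], ciphers[j]] = [ciphers[j], ciphers[i]];
  }
  return new Agent({ ciphers: ciphers.join(':') });
}

// Usage with the classic HTTPS client, which does accept TLS options:
// https.get(url, { agent: createRandomizedTlsAgent() }, handleResponse);
```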
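
The timeout change follows the standard `AbortSignal.timeout()` pattern (available since Node.js 17.3); a minimal before/after comparison:

```ts
// Before: manual AbortController plus timer bookkeeping.
async function fetchWithManualTimeout(url: string, timeoutInMs: number): Promise<Response> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutInMs);
  try {
    return await fetch(url, { signal: controller.signal });
  } finally {
    clearTimeout(timer);
  }
}

// After: a pre-armed signal, no timer to clean up.
async function fetchWithTimeout(url: string, timeoutInMs: number): Promise<Response> {
  return fetch(url, { signal: AbortSignal.timeout(timeoutInMs) });
}
```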

Author: undergroundwires
Date:   2024-03-16 18:15:34 +01:00
Parent: e7218850ba
Commit: 5abf8ff216
18 changed files with 363 additions and 222 deletions


@@ -1,50 +1,84 @@
import { test, expect } from 'vitest';
import { parseApplication } from '@/application/Parser/ApplicationParser';
import type { IApplication } from '@/domain/IApplication';
-import { getUrlStatusesInParallel, type IBatchRequestOptions } from './StatusChecker/BatchStatusChecker';
-import type { IUrlStatus } from './StatusChecker/IUrlStatus';
+import { indentText } from '@tests/shared/Text';
+import { formatAssertionMessage } from '@tests/shared/FormatAssertionMessage';
+import { type UrlStatus, formatUrlStatus } from './StatusChecker/UrlStatus';
+import { getUrlStatusesInParallel, type BatchRequestOptions } from './StatusChecker/BatchStatusChecker';
// arrange
const app = parseApplication();
-const urls = collectUniqueUrls(app);
-const requestOptions: IBatchRequestOptions = {
+const urls = collectUniqueUrls({
+application: app,
+excludePatterns: [
+/^https:\/\/archive\.ph/, // Drops HEAD/GET requests via fetch/curl, responding to Postman/Chromium.
+],
+});
+const requestOptions: BatchRequestOptions = {
domainOptions: {
-sameDomainParallelize: false, // be nice to our external servers
+sameDomainParallelize: false, // be nice to our third-party servers
sameDomainDelayInMs: 5 /* sec */ * 1000,
},
requestOptions: {
retryExponentialBaseInMs: 3 /* sec */ * 1000,
requestTimeoutInMs: 60 /* sec */ * 1000,
additionalHeaders: { referer: app.projectDetails.homepage },
randomizeTlsFingerprint: true,
},
followOptions: {
followRedirects: true,
enableCookies: true,
},
};
const testTimeoutInMs = urls.length * 60 /* seconds */ * 1000;
test(`all URLs (${urls.length}) should be alive`, async () => {
// act
const results = await getUrlStatusesInParallel(urls, requestOptions);
-const deadUrls = results.filter((r) => r.code !== 200);
-expect(deadUrls).to.have.lengthOf(0, printUrls(deadUrls));
+// assert
+const deadUrls = results.filter((r) => r.code === undefined || !isOkStatusCode(r.code));
+expect(deadUrls).to.have.lengthOf(0, formatAssertionMessage([formatUrlStatusReport(deadUrls)]));
}, testTimeoutInMs);
-function collectUniqueUrls(application: IApplication): string[] {
+function isOkStatusCode(statusCode: number): boolean {
+return statusCode >= 200 && statusCode < 300;
+}
+function collectUniqueUrls(
+options: {
+readonly application: IApplication,
+readonly excludePatterns?: readonly RegExp[],
+},
+): string[] {
return [ // Get all nodes
-...application.collections.flatMap((c) => c.getAllCategories()),
-...application.collections.flatMap((c) => c.getAllScripts()),
+...options.application.collections.flatMap((c) => c.getAllCategories()),
+...options.application.collections.flatMap((c) => c.getAllScripts()),
]
// Get all docs
.flatMap((documentable) => documentable.docs)
// Parse all URLs
-.flatMap((docString) => docString.match(/(https?:\/\/[^\s`"<>()]+)/g) || [])
+.flatMap((docString) => extractUrls(docString))
// Remove duplicates
-.filter((url, index, array) => array.indexOf(url) === index);
+.filter((url, index, array) => array.indexOf(url) === index)
+// Exclude certain URLs based on patterns
+.filter((url) => !shouldExcludeUrl(url, options.excludePatterns ?? []));
}
-function printUrls(statuses: IUrlStatus[]): string {
-/* eslint-disable prefer-template */
-return '\n'
-+ statuses.map((status) => `- ${status.url}\n`
-+ (status.code ? `\tResponse code: ${status.code}` : '')
-+ (status.error ? `\tError: ${status.error}` : ''))
-.join('\n')
-+ '\n';
-/* eslint-enable prefer-template */
+function shouldExcludeUrl(url: string, patterns: readonly RegExp[]): boolean {
+return patterns.some((pattern) => pattern.test(url));
+}
+function formatUrlStatusReport(deadUrlStatuses: readonly UrlStatus[]): string {
+return `\n${deadUrlStatuses.map((status) => indentText(formatUrlStatus(status))).join('\n---\n')}\n`;
+}
+function extractUrls(textWithInlineCode: string): string[] {
+/*
+Matches URLs:
+- Excludes inline code blocks as they may contain URLs not intended for user interaction
+and not guaranteed to support expected HTTP methods, leading to false-negatives.
+- Supports URLs containing parentheses, avoiding matches within code that might not represent
+actual links.
+*/
+const nonCodeBlockUrlRegex = /(?<!`)(https?:\/\/[^\s`"<>()]+(?:\([^\s`"<>()]*\))?[^\s`"<>()]*)/g;
+return textWithInlineCode.match(nonCodeBlockUrlRegex) || [];
}
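
To illustrate the new matcher's behaviour (an example, not part of the commit): a URL immediately inside inline code is skipped, while a URL containing parentheses is kept whole:

```ts
test('extractUrls skips inline-code URLs and keeps parenthesised ones', () => {
  const doc = 'See `https://example.com/internal` and https://en.wikipedia.org/wiki/Hosts_(file)';
  expect(extractUrls(doc)).to.deep.equal(['https://en.wikipedia.org/wiki/Hosts_(file)']);
});
```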