Files
privacy.sexy/tests/checks/external-urls/main.spec.ts
undergroundwires 5abf8ff216 Improve URL checks to reduce false-negatives
This commit improves the URL health checking mechanism to reduce false
negatives.

- Treat all 2XX status codes as successful, addressing issues with codes
  like `204`.
- Improve URL matching to exclude URLs within Markdown inline code block
  and support URLs containing parentheses.
- Add `forceHttpGetForUrlPatterns` to customize HTTP method per URL to
  allow verifying URLs behind CDN/WAFs that do not respond to HTTP HEAD.
- Send the Host header for improved handling of webpages behind proxies.
- Improve formatting and context for output messages.
- Fix the defaulting options for redirects and cookie handling.
- Update the user agent pool to modern browsers and platforms.
- Add support for randomizing TLS fingerprint to mimic various clients
  better, improving the effectiveness of checks. However, this is not
  fully supported by Node.js's HTTP client; see nodejs/undici#1983 for
  more details.
- Use `AbortSignal` instead of `AbortController` as more modern and
  simpler way to handle timeouts.
2024-03-16 18:15:34 +01:00

85 lines
3.2 KiB
TypeScript

import { test, expect } from 'vitest';
import { parseApplication } from '@/application/Parser/ApplicationParser';
import type { IApplication } from '@/domain/IApplication';
import { indentText } from '@tests/shared/Text';
import { formatAssertionMessage } from '@tests/shared/FormatAssertionMessage';
import { type UrlStatus, formatUrlStatus } from './StatusChecker/UrlStatus';
import { getUrlStatusesInParallel, type BatchRequestOptions } from './StatusChecker/BatchStatusChecker';
// arrange
const app = parseApplication();
const urls = collectUniqueUrls({
application: app,
excludePatterns: [
/^https:\/\/archive\.ph/, // Drops HEAD/GET requests via fetch/curl, responding to Postman/Chromium.
],
});
const requestOptions: BatchRequestOptions = {
domainOptions: {
sameDomainParallelize: false, // be nice to our third-party servers
sameDomainDelayInMs: 5 /* sec */ * 1000,
},
requestOptions: {
retryExponentialBaseInMs: 3 /* sec */ * 1000,
requestTimeoutInMs: 60 /* sec */ * 1000,
additionalHeaders: { referer: app.projectDetails.homepage },
randomizeTlsFingerprint: true,
},
followOptions: {
followRedirects: true,
enableCookies: true,
},
};
const testTimeoutInMs = urls.length * 60 /* seconds */ * 1000;
test(`all URLs (${urls.length}) should be alive`, async () => {
// act
const results = await getUrlStatusesInParallel(urls, requestOptions);
// assert
const deadUrls = results.filter((r) => r.code === undefined || !isOkStatusCode(r.code));
expect(deadUrls).to.have.lengthOf(0, formatAssertionMessage([formatUrlStatusReport(deadUrls)]));
}, testTimeoutInMs);
function isOkStatusCode(statusCode: number): boolean {
return statusCode >= 200 && statusCode < 300;
}
function collectUniqueUrls(
options: {
readonly application: IApplication,
readonly excludePatterns?: readonly RegExp[],
},
): string[] {
return [ // Get all nodes
...options.application.collections.flatMap((c) => c.getAllCategories()),
...options.application.collections.flatMap((c) => c.getAllScripts()),
]
// Get all docs
.flatMap((documentable) => documentable.docs)
// Parse all URLs
.flatMap((docString) => extractUrls(docString))
// Remove duplicates
.filter((url, index, array) => array.indexOf(url) === index)
// Exclude certain URLs based on patterns
.filter((url) => !shouldExcludeUrl(url, options.excludePatterns ?? []));
}
function shouldExcludeUrl(url: string, patterns: readonly RegExp[]): boolean {
return patterns.some((pattern) => pattern.test(url));
}
function formatUrlStatusReport(deadUrlStatuses: readonly UrlStatus[]): string {
return `\n${deadUrlStatuses.map((status) => indentText(formatUrlStatus(status))).join('\n---\n')}\n`;
}
function extractUrls(textWithInlineCode: string): string[] {
/*
Matches URLs:
- Excludes inline code blocks as they may contain URLs not intended for user interaction
and not guaranteed to support expected HTTP methods, leading to false-negatives.
- Supports URLs containing parentheses, avoiding matches within code that might not represent
actual links.
*/
const nonCodeBlockUrlRegex = /(?<!`)(https?:\/\/[^\s`"<>()]+(?:\([^\s`"<>()]*\))?[^\s`"<>()]*)/g;
return textWithInlineCode.match(nonCodeBlockUrlRegex) || [];
}