Improve URL checks to reduce false-negatives
This commit improves the URL health checking mechanism to reduce false negatives.

- Treat all 2XX status codes as successful, addressing issues with codes like `204`.
- Improve URL matching to exclude URLs within Markdown inline code blocks and to support URLs containing parentheses.
- Add `forceHttpGetForUrlPatterns` to customize the HTTP method per URL, allowing verification of URLs behind CDNs/WAFs that do not respond to HTTP HEAD.
- Send the Host header for improved handling of webpages behind proxies.
- Improve formatting and context of output messages.
- Fix the default options for redirects and cookie handling.
- Update the user agent pool to modern browsers and platforms.
- Add support for randomizing the TLS fingerprint to better mimic various clients, improving the effectiveness of checks. However, this is not fully supported by Node.js's HTTP client; see nodejs/undici#1983 for more details.
- Use `AbortSignal` instead of `AbortController` as a more modern and simpler way to handle timeouts.
This commit is contained in:
@@ -1,50 +1,84 @@
|
||||
import { test, expect } from 'vitest';
|
||||
import { parseApplication } from '@/application/Parser/ApplicationParser';
|
||||
import type { IApplication } from '@/domain/IApplication';
|
||||
import { getUrlStatusesInParallel, type IBatchRequestOptions } from './StatusChecker/BatchStatusChecker';
|
||||
import type { IUrlStatus } from './StatusChecker/IUrlStatus';
|
||||
import { indentText } from '@tests/shared/Text';
|
||||
import { formatAssertionMessage } from '@tests/shared/FormatAssertionMessage';
|
||||
import { type UrlStatus, formatUrlStatus } from './StatusChecker/UrlStatus';
|
||||
import { getUrlStatusesInParallel, type BatchRequestOptions } from './StatusChecker/BatchStatusChecker';
|
||||
|
||||
// arrange
|
||||
const app = parseApplication();
|
||||
const urls = collectUniqueUrls(app);
|
||||
const requestOptions: IBatchRequestOptions = {
|
||||
const urls = collectUniqueUrls({
|
||||
application: app,
|
||||
excludePatterns: [
|
||||
/^https:\/\/archive\.ph/, // Drops HEAD/GET requests via fetch/curl, responding to Postman/Chromium.
|
||||
],
|
||||
});
|
||||
const requestOptions: BatchRequestOptions = {
|
||||
domainOptions: {
|
||||
sameDomainParallelize: false, // be nice to our external servers
|
||||
sameDomainParallelize: false, // be nice to our third-party servers
|
||||
sameDomainDelayInMs: 5 /* sec */ * 1000,
|
||||
},
|
||||
requestOptions: {
|
||||
retryExponentialBaseInMs: 3 /* sec */ * 1000,
|
||||
requestTimeoutInMs: 60 /* sec */ * 1000,
|
||||
additionalHeaders: { referer: app.projectDetails.homepage },
|
||||
randomizeTlsFingerprint: true,
|
||||
},
|
||||
followOptions: {
|
||||
followRedirects: true,
|
||||
enableCookies: true,
|
||||
},
|
||||
};
|
||||
const testTimeoutInMs = urls.length * 60 /* seconds */ * 1000;
|
||||
|
||||
test(`all URLs (${urls.length}) should be alive`, async () => {
|
||||
// act
|
||||
const results = await getUrlStatusesInParallel(urls, requestOptions);
|
||||
const deadUrls = results.filter((r) => r.code !== 200);
|
||||
expect(deadUrls).to.have.lengthOf(0, printUrls(deadUrls));
|
||||
// assert
|
||||
const deadUrls = results.filter((r) => r.code === undefined || !isOkStatusCode(r.code));
|
||||
expect(deadUrls).to.have.lengthOf(0, formatAssertionMessage([formatUrlStatusReport(deadUrls)]));
|
||||
}, testTimeoutInMs);
|
||||
|
||||
function collectUniqueUrls(application: IApplication): string[] {
|
||||
function isOkStatusCode(statusCode: number): boolean {
|
||||
return statusCode >= 200 && statusCode < 300;
|
||||
}
|
||||
|
||||
function collectUniqueUrls(
|
||||
options: {
|
||||
readonly application: IApplication,
|
||||
readonly excludePatterns?: readonly RegExp[],
|
||||
},
|
||||
): string[] {
|
||||
return [ // Get all nodes
|
||||
...application.collections.flatMap((c) => c.getAllCategories()),
|
||||
...application.collections.flatMap((c) => c.getAllScripts()),
|
||||
...options.application.collections.flatMap((c) => c.getAllCategories()),
|
||||
...options.application.collections.flatMap((c) => c.getAllScripts()),
|
||||
]
|
||||
// Get all docs
|
||||
.flatMap((documentable) => documentable.docs)
|
||||
// Parse all URLs
|
||||
.flatMap((docString) => docString.match(/(https?:\/\/[^\s`"<>()]+)/g) || [])
|
||||
.flatMap((docString) => extractUrls(docString))
|
||||
// Remove duplicates
|
||||
.filter((url, index, array) => array.indexOf(url) === index);
|
||||
.filter((url, index, array) => array.indexOf(url) === index)
|
||||
// Exclude certain URLs based on patterns
|
||||
.filter((url) => !shouldExcludeUrl(url, options.excludePatterns ?? []));
|
||||
}
|
||||
|
||||
function printUrls(statuses: IUrlStatus[]): string {
|
||||
/* eslint-disable prefer-template */
|
||||
return '\n'
|
||||
+ statuses.map((status) => `- ${status.url}\n`
|
||||
+ (status.code ? `\tResponse code: ${status.code}` : '')
|
||||
+ (status.error ? `\tError: ${status.error}` : ''))
|
||||
.join('\n')
|
||||
+ '\n';
|
||||
/* eslint-enable prefer-template */
|
||||
function shouldExcludeUrl(url: string, patterns: readonly RegExp[]): boolean {
|
||||
return patterns.some((pattern) => pattern.test(url));
|
||||
}
|
||||
|
||||
function formatUrlStatusReport(deadUrlStatuses: readonly UrlStatus[]): string {
|
||||
return `\n${deadUrlStatuses.map((status) => indentText(formatUrlStatus(status))).join('\n---\n')}\n`;
|
||||
}
|
||||
|
||||
function extractUrls(textWithInlineCode: string): string[] {
|
||||
/*
|
||||
Matches URLs:
|
||||
- Excludes inline code blocks as they may contain URLs not intended for user interaction
|
||||
and not guaranteed to support expected HTTP methods, leading to false-negatives.
|
||||
- Supports URLs containing parentheses, avoiding matches within code that might not represent
|
||||
actual links.
|
||||
*/
|
||||
const nonCodeBlockUrlRegex = /(?<!`)(https?:\/\/[^\s`"<>()]+(?:\([^\s`"<>()]*\))?[^\s`"<>()]*)/g;
|
||||
return textWithInlineCode.match(nonCodeBlockUrlRegex) || [];
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user