This commit improves the URL health checking mechanism to reduce false negatives.

- Treat all 2XX status codes as successful, addressing issues with codes like `204`.
- Exclude URLs within Markdown inline code blocks.
- Send the Host header for improved handling of webpages behind proxies.
- Improve formatting and context for output messages.
- Fix the defaulting of options for redirects and cookie handling.
- Add URL exclusion support for non-responsive URLs.
- Update the user agent pool to modern browsers and platforms.
- Improve the CI/CD workflow to respond to modifications in the `test/checks/external-urls` directory, offering immediate feedback on potential impacts to the external URL test.
- Add support for randomizing the TLS fingerprint to better mimic various clients, improving the effectiveness of checks. However, this is not fully supported by Node.js's HTTP client; see nodejs/undici#1983 for more details.
- Use `AbortSignal` instead of `AbortController` as a more modern and simpler way to handle timeouts.
109 lines
3.6 KiB
TypeScript
109 lines
3.6 KiB
TypeScript
import { indentText } from '@tests/shared/Text';
|
|
import { retryWithExponentialBackOff } from './ExponentialBackOffRetryHandler';
|
|
import { fetchFollow, type FollowOptions } from './FetchFollow';
|
|
import { getRandomUserAgent } from './UserAgents';
|
|
import { getDomainFromUrl } from './UrlDomainProcessing';
|
|
import { randomizeTlsFingerprint, getTlsContextInfo } from './TlsFingerprintRandomizer';
|
|
import type { UrlStatus } from './UrlStatus';
|
|
|
|
/**
 * Checks a single URL and resolves with its status.
 *
 * Missing request options are filled in with defaults first. When the
 * resulting options ask for it, the TLS fingerprint is randomized before
 * the request is handed over to the retrying fetch routine.
 *
 * @param url - The URL to check.
 * @param requestOptions - Optional overrides for the default request options.
 * @param followOptions - Optional settings forwarded untouched to the fetch routine.
 * @returns A promise resolving to the status produced by the retrying fetch routine.
 */
export function getUrlStatus(
  url: string,
  requestOptions?: Partial<RequestOptions>,
  followOptions?: Partial<FollowOptions>,
): Promise<UrlStatus> {
  const effectiveOptions = getDefaultedRequestOptions(requestOptions);
  const { randomizeTlsFingerprint: shouldRandomizeTls } = effectiveOptions;
  if (shouldRandomizeTls) {
    randomizeTlsFingerprint();
  }
  return fetchUrlStatusWithRetry(url, effectiveOptions, followOptions);
}
|
|
|
|
/**
 * Options controlling how a URL status request is issued.
 */
export interface RequestOptions {
  /** Timeout, in milliseconds, passed to the fetch routine for the request. */
  readonly requestTimeoutInMs: number;

  /** When `true`, the TLS fingerprint is randomized before the request is made. */
  readonly randomizeTlsFingerprint: boolean;

  /** Base value, in milliseconds, handed to the exponential back-off retry handler. */
  readonly retryExponentialBaseInMs?: number;

  /** Extra headers merged on top of the default request headers. */
  readonly additionalHeaders?: Record<string, string>;

  /**
   * URL prefixes for which `additionalHeaders` are not sent:
   * a URL starting with any of these values receives only the default headers.
   */
  readonly additionalHeadersUrlIgnore?: string[];
}
|
|
|
|
/**
 * Fallback values used for every request option the caller leaves out.
 */
const DefaultOptions: Required<RequestOptions> = {
  // 5 seconds, expressed in milliseconds.
  retryExponentialBaseInMs: 5 * 1000,
  // No extra headers beyond the defaults.
  additionalHeaders: {},
  // No URL is exempt from receiving the additional headers.
  additionalHeadersUrlIgnore: [],
  // 60 seconds, expressed in milliseconds.
  requestTimeoutInMs: 60 * 1000,
  randomizeTlsFingerprint: true,
};
|
|
|
|
/**
 * Requests the given URL through the exponential back-off retry handler and
 * converts the outcome into a `UrlStatus`.
 *
 * A completed request yields `{ url, code }` with the response status.
 * A thrown error is not propagated; it yields `{ url, error }`, where `error`
 * is a multi-line report containing the error details, the fetch options used
 * and the TLS context information.
 *
 * @param url - The URL to request.
 * @param requestOptions - Fully defaulted request options.
 * @param followOptions - Optional settings forwarded untouched to `fetchFollow`.
 * @returns A promise resolving to whatever the retry handler settles on.
 */
function fetchUrlStatusWithRetry(
  url: string,
  requestOptions: Required<RequestOptions>,
  followOptions?: Partial<FollowOptions>,
): Promise<UrlStatus> {
  const fetchOptions = getFetchOptions(url, requestOptions);
  return retryWithExponentialBackOff(async (): Promise<UrlStatus> => {
    console.log(`Initiating request for URL: ${url}`);
    try {
      const response = await fetchFollow(
        url,
        requestOptions.requestTimeoutInMs,
        fetchOptions,
        followOptions,
      );
      return { url, code: response.status };
    } catch (err) {
      return {
        url,
        error: [
          'Error:', indentText(formatRequestError(err)),
          'Options:', indentText(JSON.stringify(fetchOptions, null, '\t')),
          'TLS:', indentText(getTlsContextInfo()),
        ].join('\n'),
      };
    }
  }, requestOptions.retryExponentialBaseInMs);
}

/**
 * Renders a thrown value, followed by its chain of `cause` values, as text.
 *
 * `JSON.stringify` alone is not enough: for `Error` instances it produces
 * `{}` because `name` and `message` are not enumerable, hiding the failure.
 */
function formatRequestError(error: unknown): string {
  const visited = new Set<unknown>(); // Guards against cyclic `cause` chains.
  const lines: string[] = [];
  let current: unknown = error;
  while (current !== undefined && current !== null && !visited.has(current)) {
    visited.add(current);
    const description = describeThrownValue(current);
    lines.push(lines.length === 0 ? description : `Caused by: ${description}`);
    current = current instanceof Error
      ? (current as { cause?: unknown }).cause
      : undefined;
  }
  return lines.length > 0 ? lines.join('\n') : String(error);
}

/** Describes one thrown value: name, message and enumerable fields for errors, JSON otherwise. */
function describeThrownValue(value: unknown): string {
  if (!(value instanceof Error)) {
    return stringifySafely(value) ?? String(value);
  }
  const summary = `${value.name}: ${value.message}`;
  // Enumerable own fields (e.g. `code`, `errno`) carry useful diagnostics;
  // `cause` is excluded because it is reported separately as part of the chain.
  const extraFields = Object.fromEntries(
    Object.entries(value).filter(([key]) => key !== 'cause'),
  );
  if (Object.keys(extraFields).length === 0) {
    return summary;
  }
  const serializedFields = stringifySafely(extraFields);
  return serializedFields === undefined ? summary : `${summary}\n${serializedFields}`;
}

/** Serializes a value to tab-indented JSON, or `undefined` when it cannot be serialized. */
function stringifySafely(value: unknown): string | undefined {
  try {
    return JSON.stringify(value, null, '\t');
  } catch {
    return undefined; // E.g. circular structures or BigInt values.
  }
}
|
|
|
|
/**
 * Builds the fetch options used to request the given URL.
 *
 * The default headers are always sent. The additional headers from `options`
 * are layered on top unless the URL starts with one of the
 * `additionalHeadersUrlIgnore` prefixes. An additional header replaces a
 * default header of the same name regardless of letter case, because HTTP
 * header names are case-insensitive and sending both would duplicate the field.
 *
 * @param url - The URL that will be requested.
 * @param options - Fully defaulted request options.
 * @returns Options for a `GET` request with manual redirect handling.
 */
function getFetchOptions(url: string, options: Required<RequestOptions>): RequestInit {
  const skipAdditionalHeaders = options.additionalHeadersUrlIgnore
    .some((ignorePattern) => url.startsWith(ignorePattern));
  const additionalHeaders = skipAdditionalHeaders ? {} : options.additionalHeaders;
  return {
    // NOTE(review): an earlier comment here claimed this fetches only headers;
    // that describes HEAD, whereas GET requests the full resource. GET is kept
    // to preserve existing behavior.
    method: 'GET',
    headers: mergeHeadersIgnoringCase(getDefaultHeaders(url), additionalHeaders),
    redirect: 'manual', // Redirects are handled manually, automatic redirects do not work with Host header
  };
}

/** Merges two header maps; overriding headers replace base headers whose names match case-insensitively. */
function mergeHeadersIgnoringCase(
  baseHeaders: Record<string, string>,
  overridingHeaders: Record<string, string>,
): Record<string, string> {
  const overriddenNames = new Set(
    Object.keys(overridingHeaders).map((name) => name.toLowerCase()),
  );
  const keptBaseHeaders = Object.fromEntries(
    Object.entries(baseHeaders)
      .filter(([name]) => !overriddenNames.has(name.toLowerCase())),
  );
  return {
    ...keptBaseHeaders,
    ...overridingHeaders,
  };
}
|
|
|
|
/**
 * Produces the headers sent with every request to the given URL.
 *
 * @param url - The URL being requested; its domain becomes the `Host` header.
 * @returns A header map with a random user agent, the host and browser-like headers.
 */
function getDefaultHeaders(url: string): Record<string, string> {
  // Some websites filter out non-browser user agents.
  const userAgent = getRandomUserAgent();

  // Some websites, especially those behind proxies, need this to handle the request correctly.
  const host = getDomainFromUrl(url);

  // Mimics a real browser request to improve compatibility with most web servers.
  const browserLikeHeaders: Record<string, string> = {
    'Upgrade-Insecure-Requests': '1',
    Connection: 'keep-alive',
    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Cache-Control': 'max-age=0',
    'Accept-Language': 'en-US,en;q=0.9',
  };

  return {
    'User-Agent': userAgent,
    Host: host,
    ...browserLikeHeaders,
  };
}
|
|
|
|
/**
 * Completes partially specified request options with the default values.
 *
 * Values supplied by the caller win over the defaults. Properties that are
 * explicitly set to `undefined` are treated the same as missing ones, so the
 * result always holds a usable value for every option.
 *
 * @param options - Optional overrides; may be omitted entirely.
 * @returns A new object holding every request option.
 */
function getDefaultedRequestOptions(
  options?: Partial<RequestOptions>,
): Required<RequestOptions> {
  // A plain spread would let `{ key: undefined }` overwrite the default with
  // `undefined`, so such entries are dropped before merging.
  const definedOverrides = Object.fromEntries(
    Object.entries(options ?? {}).filter(([, value]) => value !== undefined),
  );
  return {
    ...DefaultOptions,
    ...definedOverrides,
  };
}
|