Improve URL checks to reduce false-negatives
This commit improves the URL health checking mechanism to reduce false negatives.

- Treat all 2XX status codes as successful, addressing issues with codes like `204`.
- Improve URL matching to exclude URLs within Markdown inline code blocks and to support URLs containing parentheses.
- Add `forceHttpGetForUrlPatterns` to customize the HTTP method per URL, allowing verification of URLs behind CDNs/WAFs that do not respond to HTTP HEAD.
- Send the Host header for improved handling of webpages behind proxies.
- Improve formatting and context for output messages.
- Fix the defaulting of options for redirects and cookie handling.
- Update the user agent pool to modern browsers and platforms.
- Add support for randomizing the TLS fingerprint to mimic various clients better, improving the effectiveness of checks. However, this is not fully supported by Node.js's HTTP client; see nodejs/undici#1983 for more details.
- Use `AbortSignal` instead of `AbortController` as a more modern and simpler way to handle timeouts.
This commit is contained in:
@@ -1,70 +1,123 @@
|
||||
import { indentText } from '@tests/shared/Text';
|
||||
import { retryWithExponentialBackOff } from './ExponentialBackOffRetryHandler';
|
||||
import { fetchFollow, type IFollowOptions, DefaultFollowOptions } from './FetchFollow';
|
||||
import { fetchFollow, type FollowOptions } from './FetchFollow';
|
||||
import { getRandomUserAgent } from './UserAgents';
|
||||
import type { IUrlStatus } from './IUrlStatus';
|
||||
import { getDomainFromUrl } from './UrlDomainProcessing';
|
||||
import { randomizeTlsFingerprint, getTlsContextInfo } from './TlsFingerprintRandomizer';
|
||||
import type { UrlStatus } from './UrlStatus';
|
||||
|
||||
/**
 * Checks a single URL and reports its status.
 *
 * Missing request options are filled in by `getDefaultedRequestOptions`. When
 * `randomizeTlsFingerprint` is enabled, `randomizeTlsFingerprint()` is invoked
 * once before the request is started. The request itself, including retries,
 * is delegated to `fetchUrlStatusWithRetry`.
 *
 * @param url - The URL to check.
 * @param requestOptions - Optional overrides for the default request options.
 * @param followOptions - Optional redirect-following options, passed through
 *   unchanged to `fetchUrlStatusWithRetry`.
 * @returns A promise for the status produced by `fetchUrlStatusWithRetry`.
 */
export function getUrlStatus(
  url: string,
  requestOptions?: Partial<RequestOptions>,
  followOptions?: Partial<FollowOptions>,
): Promise<UrlStatus> {
  const defaultedOptions = getDefaultedRequestOptions(requestOptions);
  if (defaultedOptions.randomizeTlsFingerprint) {
    // Must happen before the request is issued so it can affect the connection.
    randomizeTlsFingerprint();
  }
  return fetchUrlStatusWithRetry(url, defaultedOptions, followOptions);
}
|
||||
|
||||
/**
 * Options controlling how a URL status request is performed.
 */
export interface RequestOptions {
  /** Base value (in milliseconds) handed to the exponential back-off retry helper. */
  readonly retryExponentialBaseInMs?: number;

  /** Extra headers merged on top of the default headers. */
  readonly additionalHeaders?: Record<string, string>;

  /**
   * URL prefixes for which `additionalHeaders` are not sent.
   * A URL is ignored when it starts with any of these strings.
   */
  readonly additionalHeadersUrlIgnore?: string[];

  /** Timeout (in milliseconds) passed to the fetch helper for each request. */
  readonly requestTimeoutInMs: number;

  /** Whether to randomize the TLS fingerprint before issuing the request. */
  readonly randomizeTlsFingerprint: boolean;

  /**
   * URLs matching any of these patterns are requested with HTTP GET
   * instead of the default HTTP HEAD.
   */
  readonly forceHttpGetForUrlPatterns: RegExp[];
}
|
||||
|
||||
/**
 * Defaults applied to every request option the caller does not provide.
 */
const DefaultOptions: Required<RequestOptions> = {
  retryExponentialBaseInMs: 5 /* sec */ * 1000,
  additionalHeaders: {},
  additionalHeadersUrlIgnore: [],
  requestTimeoutInMs: 60 /* seconds */ * 1000,
  randomizeTlsFingerprint: true,
  // Empty by default: every URL is requested with HEAD unless a pattern is supplied.
  forceHttpGetForUrlPatterns: [],
};
|
||||
|
||||
/**
 * Requests `url` and converts the outcome into a `UrlStatus`, running the
 * attempt through `retryWithExponentialBackOff`.
 *
 * A completed request yields `{ url, code }` with the response status code.
 * A thrown error yields `{ url, error }`, where `error` is a multi-line report
 * containing the error description, the fetch options, the request options and
 * the TLS context information.
 *
 * @param url - The URL to request.
 * @param requestOptions - Fully defaulted request options.
 * @param followOptions - Optional redirect-following options forwarded to `fetchFollow`.
 * @returns A promise for the value returned by `retryWithExponentialBackOff`.
 */
function fetchUrlStatusWithRetry(
  url: string,
  requestOptions: Required<RequestOptions>,
  followOptions?: Partial<FollowOptions>,
): Promise<UrlStatus> {
  const fetchOptions = getFetchOptions(url, requestOptions);
  return retryWithExponentialBackOff(async () => {
    console.log(`🚀 Initiating request for URL: ${url}`);
    console.log(indentText([
      `HTTP method: ${fetchOptions.method}`,
      `Request options: ${JSON.stringify(requestOptions, serializeRegExp)}`,
    ].join('\n')));
    let result: UrlStatus;
    try {
      const response = await fetchFollow(
        url,
        requestOptions.requestTimeoutInMs,
        fetchOptions,
        followOptions,
      );
      result = { url, code: response.status };
    } catch (err: unknown) {
      result = {
        url,
        error: [
          'Error:', indentText(describeError(err)),
          'Fetch options:', indentText(JSON.stringify(fetchOptions, null, '\t')),
          'Request options:', indentText(JSON.stringify(requestOptions, serializeRegExp, '\t')),
          'TLS:', indentText(getTlsContextInfo()),
        ].join('\n'),
      };
    }
    return result;
  }, requestOptions.retryExponentialBaseInMs);
}

/**
 * Renders a thrown value as readable text, following the `cause` chain of errors.
 *
 * `JSON.stringify` turns `Error` instances into `"{}"` because their fields are
 * not enumerable, so errors are rendered from `stack`/`name`/`message` instead.
 */
function describeError(err: unknown): string {
  const parts: string[] = [];
  const seen = new Set<unknown>(); // guards against self-referencing `cause` chains
  let current: unknown = err;
  while (current !== undefined && !seen.has(current)) {
    seen.add(current);
    if (current instanceof Error) {
      parts.push(current.stack ?? `${current.name}: ${current.message}`);
      current = (current as { cause?: unknown }).cause;
    } else {
      parts.push(describeNonError(current));
      current = undefined;
    }
  }
  return parts.length > 0 ? parts.join('\nCaused by: ') : String(err);
}

/** Renders a thrown non-`Error` value; falls back to `String()` when it cannot be serialized. */
function describeNonError(value: unknown): string {
  if (typeof value === 'string') {
    return value;
  }
  try {
    return JSON.stringify(value, null, '\t') ?? String(value);
  } catch {
    return String(value);
  }
}

/** `JSON.stringify` replacer that prints regular expressions as `/source/flags` instead of `{}`. */
function serializeRegExp(_key: string, value: unknown): unknown {
  return value instanceof RegExp ? value.toString() : value;
}
|
||||
|
||||
/**
 * Builds the `RequestInit` used to request `url`.
 *
 * The HTTP method comes from `getHttpMethod`. Headers are the defaults from
 * `getDefaultHeaders`, overridden by `options.additionalHeaders` unless `url`
 * starts with one of `options.additionalHeadersUrlIgnore`.
 *
 * @param url - The URL that will be requested.
 * @param options - Fully defaulted request options.
 * @returns Fetch options with `redirect` set to `'manual'`.
 */
function getFetchOptions(url: string, options: Required<RequestOptions>): RequestInit {
  const skipAdditionalHeaders = options.additionalHeadersUrlIgnore
    .some((ignorePattern) => url.startsWith(ignorePattern));
  const additionalHeaders = skipAdditionalHeaders ? {} : options.additionalHeaders;
  return {
    method: getHttpMethod(url, options),
    headers: mergeHeadersIgnoringCase(getDefaultHeaders(url), additionalHeaders),
    redirect: 'manual', // Redirects are handled manually, automatic redirects do not work with Host header
  };
}

/**
 * Merges two header records, letting `overrides` replace entries of `base`
 * whose names differ only by letter case.
 *
 * HTTP header names are case-insensitive, so a plain object spread would keep
 * both `User-Agent` and `user-agent` and send conflicting values.
 */
function mergeHeadersIgnoringCase(
  base: Record<string, string>,
  overrides: Record<string, string>,
): Record<string, string> {
  const overriddenNames = new Set(
    Object.keys(overrides).map((name) => name.toLowerCase()),
  );
  const keptBaseEntries = Object.entries(base)
    .filter(([name]) => !overriddenNames.has(name.toLowerCase()));
  return { ...Object.fromEntries(keptBaseEntries), ...overrides };
}
|
||||
|
||||
/**
 * Chooses the HTTP method used to check `url`.
 *
 * @param url - The URL that will be requested.
 * @param options - Fully defaulted request options; only
 *   `forceHttpGetForUrlPatterns` is read.
 * @returns `'GET'` when `url` matches any configured pattern, otherwise `'HEAD'`.
 */
function getHttpMethod(url: string, options: Required<RequestOptions>): 'HEAD' | 'GET' {
  // `String.prototype.match` is used rather than `RegExp.prototype.test` so that
  // patterns carrying the global flag give the same answer on every call.
  const mustUseGet = options.forceHttpGetForUrlPatterns
    .some((pattern) => url.match(pattern) !== null);
  if (mustUseGet) {
    return 'GET';
  }
  // By default fetch only headers without the full response body for better speed
  return 'HEAD';
}
|
||||
|
||||
/**
 * Builds the default request headers for `url`.
 *
 * @param url - The URL that will be requested; used to derive the `Host` header.
 * @returns A fresh header record; the user agent is picked by
 *   `getRandomUserAgent()` on every call.
 */
function getDefaultHeaders(url: string): Record<string, string> {
  return {
    // Needed for websites that filter out non-browser user agents.
    'User-Agent': getRandomUserAgent(),

    // Required for some websites, especially those behind proxies, to correctly handle the request.
    // NOTE(review): presumably `getDomainFromUrl` returns the host part of the URL —
    // confirm whether a non-default port is included.
    Host: getDomainFromUrl(url),

    // The following mimic a real browser request to improve compatibility with most web servers.
    'Upgrade-Insecure-Requests': '1',
    Connection: 'keep-alive',
    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Cache-Control': 'max-age=0',
    'Accept-Language': 'en-US,en;q=0.9',
  };
}
|
||||
|
||||
/**
 * Fills in every request option the caller did not provide with its default.
 *
 * Overrides whose value is `undefined` are ignored. A plain object spread would
 * let `{ requestTimeoutInMs: undefined }` erase the default and break the
 * `Required<RequestOptions>` contract.
 *
 * @param options - Optional partial overrides.
 * @returns A new object containing the defaults overlaid with the defined overrides.
 */
function getDefaultedRequestOptions(
  options?: Partial<RequestOptions>,
): Required<RequestOptions> {
  const definedOverrides = Object.fromEntries(
    Object.entries(options ?? {}).filter(([, value]) => value !== undefined),
  );
  return {
    ...DefaultOptions,
    ...definedOverrides,
  };
}
|
||||
|
||||
Reference in New Issue
Block a user