Files
privacy.sexy/tests/checks/external-urls/StatusChecker/Requestor.ts
undergroundwires be7a886225 Improve URL checks to reduce false-negatives
This commit improves the URL health checking mechanism to reduce false
negatives.

- Treat all 2XX status codes as successful, addressing issues with codes
  like `204`.
- Exclude URLs within Markdown inline code blocks.
- Send the Host header for improved handling of webpages behind proxies.
- Improve formatting and context for output messages.
- Fix the defaulting options for redirects and cookie handling.
- Add URL exclusion support for non-responsive URLs.
- Update the user agent pool to modern browsers and platforms.
- Improve CI/CD workflow to respond to modifications in the
  `test/checks/external-urls` directory, offering immediate feedback on
  potential impacts to the external URL test.
- Add support for randomizing TLS fingerprint to mimic various clients
  better, improving the effectiveness of checks. However, this is not
  fully supported by Node.js's HTTP client; see nodejs/undici#1983 for
  more details.
- Use `AbortSignal` instead of `AbortController` as more modern and
  simpler way to handle timeouts.
2024-03-13 18:26:16 +01:00

109 lines
3.6 KiB
TypeScript

import { indentText } from '@tests/shared/Text';
import { retryWithExponentialBackOff } from './ExponentialBackOffRetryHandler';
import { fetchFollow, type FollowOptions } from './FetchFollow';
import { getRandomUserAgent } from './UserAgents';
import { getDomainFromUrl } from './UrlDomainProcessing';
import { randomizeTlsFingerprint, getTlsContextInfo } from './TlsFingerprintRandomizer';
import type { UrlStatus } from './UrlStatus';
export function getUrlStatus(
url: string,
requestOptions?: Partial<RequestOptions>,
followOptions?: Partial<FollowOptions>,
): Promise<UrlStatus> {
const defaultedOptions = getDefaultedRequestOptions(requestOptions);
if (defaultedOptions.randomizeTlsFingerprint) {
randomizeTlsFingerprint();
}
return fetchUrlStatusWithRetry(url, defaultedOptions, followOptions);
}
export interface RequestOptions {
readonly retryExponentialBaseInMs?: number;
readonly additionalHeaders?: Record<string, string>;
readonly additionalHeadersUrlIgnore?: string[];
readonly requestTimeoutInMs: number;
readonly randomizeTlsFingerprint: boolean;
}
const DefaultOptions: Required<RequestOptions> = {
retryExponentialBaseInMs: 5 /* sec */ * 1000,
additionalHeaders: {},
additionalHeadersUrlIgnore: [],
requestTimeoutInMs: 60 /* seconds */ * 1000,
randomizeTlsFingerprint: true,
};
function fetchUrlStatusWithRetry(
url: string,
requestOptions: Required<RequestOptions>,
followOptions?: Partial<FollowOptions>,
): Promise<UrlStatus> {
const fetchOptions = getFetchOptions(url, requestOptions);
return retryWithExponentialBackOff(async () => {
console.log(`Initiating request for URL: ${url}`);
let result: UrlStatus;
try {
const response = await fetchFollow(
url,
requestOptions.requestTimeoutInMs,
fetchOptions,
followOptions,
);
result = { url, code: response.status };
} catch (err) {
result = {
url,
error: [
'Error:', indentText(JSON.stringify(err, null, '\t') || err.toString()),
'Options:', indentText(JSON.stringify(fetchOptions, null, '\t')),
'TLS:', indentText(getTlsContextInfo()),
].join('\n'),
};
}
return result;
}, requestOptions.retryExponentialBaseInMs);
}
function getFetchOptions(url: string, options: Required<RequestOptions>): RequestInit {
const additionalHeaders = options.additionalHeadersUrlIgnore
.some((ignorePattern) => url.startsWith(ignorePattern))
? {}
: options.additionalHeaders;
return {
method: 'GET', // Fetch only headers without the full response body for better speed
headers: {
...getDefaultHeaders(url),
...additionalHeaders,
},
redirect: 'manual', // Redirects are handled manually, automatic redirects do not work with Host header
};
}
function getDefaultHeaders(url: string): Record<string, string> {
return {
// Needed for websites that filter out non-browser user agents.
'User-Agent': getRandomUserAgent(),
// Required for some websites, especially those behind proxies, to correctly handle the request.
Host: getDomainFromUrl(url),
// The following mimic a real browser request to improve compatibility with most web servers.
'Upgrade-Insecure-Requests': '1',
Connection: 'keep-alive',
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Cache-Control': 'max-age=0',
'Accept-Language': 'en-US,en;q=0.9',
};
}
function getDefaultedRequestOptions(
options?: Partial<RequestOptions>,
): Required<RequestOptions> {
return {
...DefaultOptions,
...options,
};
}