Improve URL checks to reduce false-negatives

This commit improves the URL health checking mechanism to reduce false
negatives.

- Treat all 2XX status codes as successful, addressing issues with codes
  like `204`.
- Improve URL matching to exclude URLs within Markdown inline code block
  and support URLs containing parentheses.
- Add `forceHttpGetForUrlPatterns` to customize HTTP method per URL to
  allow verifying URLs behind CDN/WAFs that do not respond to HTTP HEAD.
- Send the Host header for improved handling of webpages behind proxies.
- Improve formatting and context for output messages.
- Fix the defaulting options for redirects and cookie handling.
- Update the user agent pool to modern browsers and platforms.
- Add support for randomizing TLS fingerprint to mimic various clients
  better, improving the effectiveness of checks. However, this is not
  fully supported by Node.js's HTTP client; see nodejs/undici#1983 for
  more details.
- Use `AbortSignal` instead of `AbortController` as more modern and
  simpler way to handle timeouts.
This commit is contained in:
undergroundwires
2024-03-13 11:34:19 +01:00
parent e7218850ba
commit 287b8e61a0
18 changed files with 363 additions and 222 deletions

View File

@@ -1,64 +1,65 @@
import { sleep } from '@/infrastructure/Threading/AsyncSleep';
import { getUrlStatus, type IRequestOptions } from './Requestor';
import { groupUrlsByDomain } from './UrlPerDomainGrouper';
import type { IUrlStatus } from './IUrlStatus';
import { getUrlStatus, type RequestOptions } from './Requestor';
import { groupUrlsByDomain } from './UrlDomainProcessing';
import type { FollowOptions } from './FetchFollow';
import type { UrlStatus } from './UrlStatus';
export async function getUrlStatusesInParallel(
urls: string[],
options?: IBatchRequestOptions,
): Promise<IUrlStatus[]> {
// urls = [ 'https://privacy.sexy' ]; // Here to comment out when testing
options?: BatchRequestOptions,
): Promise<UrlStatus[]> {
// urls = ['https://privacy.sexy']; // Comment out this line to use a hardcoded URL for testing.
const uniqueUrls = Array.from(new Set(urls));
const defaultedOptions = { ...DefaultOptions, ...options };
console.log('Options: ', defaultedOptions);
const results = await request(uniqueUrls, defaultedOptions);
const defaultedDomainOptions: Required<DomainOptions> = {
...DefaultDomainOptions,
...options?.domainOptions,
};
console.log('Batch request options applied:', defaultedDomainOptions);
const results = await request(uniqueUrls, defaultedDomainOptions, options);
return results;
}
export interface IBatchRequestOptions {
domainOptions?: IDomainOptions;
requestOptions?: IRequestOptions;
export interface BatchRequestOptions {
readonly domainOptions?: Partial<DomainOptions>;
readonly requestOptions?: Partial<RequestOptions>;
readonly followOptions?: Partial<FollowOptions>;
}
interface IDomainOptions {
sameDomainParallelize?: boolean;
sameDomainDelayInMs?: number;
interface DomainOptions {
readonly sameDomainParallelize?: boolean;
readonly sameDomainDelayInMs?: number;
}
const DefaultOptions: Required<IBatchRequestOptions> = {
domainOptions: {
sameDomainParallelize: false,
sameDomainDelayInMs: 3 /* sec */ * 1000,
},
requestOptions: {
retryExponentialBaseInMs: 5 /* sec */ * 1000,
requestTimeoutInMs: 60 /* sec */ * 1000,
additionalHeaders: {},
},
const DefaultDomainOptions: Required<DomainOptions> = {
sameDomainParallelize: false,
sameDomainDelayInMs: 3 /* sec */ * 1000,
};
function request(
urls: string[],
options: Required<IBatchRequestOptions>,
): Promise<IUrlStatus[]> {
if (!options.domainOptions.sameDomainParallelize) {
domainOptions: Required<DomainOptions>,
options?: BatchRequestOptions,
): Promise<UrlStatus[]> {
if (!domainOptions.sameDomainParallelize) {
return runOnEachDomainWithDelay(
urls,
(url) => getUrlStatus(url, options.requestOptions),
options.domainOptions.sameDomainDelayInMs,
(url) => getUrlStatus(url, options?.requestOptions, options?.followOptions),
domainOptions.sameDomainDelayInMs,
);
}
return Promise.all(urls.map((url) => getUrlStatus(url, options.requestOptions)));
return Promise.all(
urls.map((url) => getUrlStatus(url, options?.requestOptions, options?.followOptions)),
);
}
async function runOnEachDomainWithDelay(
urls: string[],
action: (url: string) => Promise<IUrlStatus>,
action: (url: string) => Promise<UrlStatus>,
delayInMs: number | undefined,
): Promise<IUrlStatus[]> {
): Promise<UrlStatus[]> {
const grouped = groupUrlsByDomain(urls);
const tasks = grouped.map(async (group) => {
const results = new Array<IUrlStatus>();
const results = new Array<UrlStatus>();
/* eslint-disable no-await-in-loop */
for (const url of group) {
const status = await action(url);