This commit improves the URL health checking mechanism to reduce false negatives. - Treat all 2XX status codes as successful, addressing issues with codes like `204`. - Improve URL matching to exclude URLs within Markdown inline code blocks and to support URLs containing parentheses. - Add `forceHttpGetForUrlPatterns` to customize the HTTP method per URL, allowing verification of URLs behind CDNs/WAFs that do not respond to HTTP HEAD. - Send the Host header for improved handling of webpages behind proxies. - Improve formatting and context for output messages. - Fix the defaulting of options for redirects and cookie handling. - Update the user agent pool to modern browsers and platforms. - Add support for randomizing the TLS fingerprint to mimic various clients better, improving the effectiveness of checks. However, this is not fully supported by Node.js's HTTP client; see nodejs/undici#1983 for more details. - Use `AbortSignal` instead of `AbortController` as a more modern and simpler way to handle timeouts.
79 lines
2.4 KiB
TypeScript
79 lines
2.4 KiB
TypeScript
import { sleep } from '@/infrastructure/Threading/AsyncSleep';
|
|
import { getUrlStatus, type RequestOptions } from './Requestor';
|
|
import { groupUrlsByDomain } from './UrlDomainProcessing';
|
|
import type { FollowOptions } from './FetchFollow';
|
|
import type { UrlStatus } from './UrlStatus';
|
|
|
|
/**
 * Checks the status of the given URLs, requesting each distinct URL only once.
 *
 * Domain-level options are resolved against `DefaultDomainOptions`: an option
 * that is missing or explicitly `undefined` falls back to its default value.
 *
 * @param urls - URLs to check; duplicates are removed before any request is made.
 * @param options - Optional batch settings. `domainOptions` selects how requests
 * are scheduled; the whole object is also passed on to `request`, which reads
 * `requestOptions` and `followOptions` from it.
 * @returns The statuses produced for the de-duplicated URLs.
 */
export async function getUrlStatusesInParallel(
  urls: string[],
  options?: BatchRequestOptions,
): Promise<UrlStatus[]> {
  // urls = ['https://privacy.sexy']; // Uncomment this line to use a hardcoded URL for testing.
  const uniqueUrls = Array.from(new Set(urls));
  const overrides = options?.domainOptions;
  // Resolve each option with `??` instead of object spread: spreading a partial
  // object whose property is explicitly `undefined` would overwrite the default
  // with `undefined` and break the `Required<DomainOptions>` guarantee.
  const defaultedDomainOptions: Required<DomainOptions> = {
    sameDomainParallelize:
      overrides?.sameDomainParallelize ?? DefaultDomainOptions.sameDomainParallelize,
    sameDomainDelayInMs:
      overrides?.sameDomainDelayInMs ?? DefaultDomainOptions.sameDomainDelayInMs,
  };
  console.log('Batch request options applied:', defaultedDomainOptions);
  const results = await request(uniqueUrls, defaultedDomainOptions, options);
  return results;
}
|
|
|
|
/**
 * Settings accepted by `getUrlStatusesInParallel`. Every member is optional.
 */
export interface BatchRequestOptions {
  /** Overrides for how requests are scheduled; unspecified values use `DefaultDomainOptions`. */
  readonly domainOptions?: Partial<DomainOptions>;

  /** Passed as the second argument of every `getUrlStatus` call. */
  readonly requestOptions?: Partial<RequestOptions>;

  /** Passed as the third argument of every `getUrlStatus` call. */
  readonly followOptions?: Partial<FollowOptions>;
}
|
|
|
|
/**
 * Controls how the URLs of a batch are scheduled.
 */
interface DomainOptions {
  /**
   * When `true`, every URL is requested at once.
   * Otherwise URLs are grouped by `groupUrlsByDomain` and each group is
   * requested one URL at a time.
   */
  readonly sameDomainParallelize?: boolean;

  /**
   * Pause, in milliseconds, between two consecutive requests of the same group.
   * Only used when `sameDomainParallelize` is not enabled.
   */
  readonly sameDomainDelayInMs?: number;
}
|
|
|
|
/**
 * Values used for any domain option the caller does not provide.
 */
const DefaultDomainOptions: Required<DomainOptions> = {
  // Request the URLs of a group one after another by default.
  sameDomainParallelize: false,
  // Three seconds, expressed in milliseconds.
  sameDomainDelayInMs: 3 * 1000,
};
|
|
|
|
/**
 * Requests the status of every URL using the strategy selected by `domainOptions`.
 *
 * @param urls - URLs to request.
 * @param domainOptions - Fully resolved scheduling options.
 * @param options - Batch options whose `requestOptions` and `followOptions`
 * are handed to each `getUrlStatus` call.
 * @returns The statuses of the requested URLs.
 */
function request(
  urls: string[],
  domainOptions: Required<DomainOptions>,
  options?: BatchRequestOptions,
): Promise<UrlStatus[]> {
  // Single definition of the per-URL check, shared by both strategies.
  const checkUrl = (url: string): Promise<UrlStatus> => getUrlStatus(
    url,
    options?.requestOptions,
    options?.followOptions,
  );

  if (domainOptions.sameDomainParallelize) {
    // Start every request at once.
    return Promise.all(urls.map((url) => checkUrl(url)));
  }

  // Otherwise go group by group, pausing between requests within a group.
  return runOnEachDomainWithDelay(urls, checkUrl, domainOptions.sameDomainDelayInMs);
}
|
|
|
|
/**
 * Runs `action` for every URL. Groups returned by `groupUrlsByDomain` are
 * processed concurrently, while the URLs inside one group are processed
 * sequentially.
 *
 * @param urls - URLs to process.
 * @param action - Asynchronous operation that yields the status of one URL.
 * @param delayInMs - Time to wait between two consecutive URLs of the same
 * group; when `undefined`, no wait is inserted.
 * @returns The statuses of all groups, flattened into a single array.
 */
async function runOnEachDomainWithDelay(
  urls: string[],
  action: (url: string) => Promise<UrlStatus>,
  delayInMs: number | undefined,
): Promise<UrlStatus[]> {
  const domainTasks = groupUrlsByDomain(urls).map(async (domainUrls) => {
    const statuses: UrlStatus[] = [];
    let remaining = domainUrls.length;
    /* eslint-disable no-await-in-loop */
    for (const url of domainUrls) {
      statuses.push(await action(url));
      remaining -= 1;
      // Pause only between requests, never after the final one of the group.
      if (remaining > 0 && delayInMs !== undefined) {
        await sleep(delayInMs);
      }
    }
    /* eslint-enable no-await-in-loop */
    return statuses;
  });
  const statusesPerDomain = await Promise.all(domainTasks);
  return statusesPerDomain.flat();
}
|