Improve URL checks to reduce false-negatives
This commit improves the URL health checking mechanism to reduce false negatives.

- Treat all 2XX status codes as successful, addressing issues with codes like `204`.
- Exclude URLs within Markdown inline code blocks.
- Send the Host header for improved handling of webpages behind proxies.
- Improve formatting and context for output messages.
- Fix the defaulting of options for redirects and cookie handling.
- Add URL exclusion support for non-responsive URLs.
- Update the user agent pool to modern browsers and platforms.
- Improve the CI/CD workflow to respond to modifications in the `test/checks/external-urls` directory, offering immediate feedback on potential impacts to the external URL test.
- Add support for randomizing the TLS fingerprint to better mimic various clients, improving the effectiveness of checks. However, this is not fully supported by Node.js's HTTP client; see nodejs/undici#1983 for more details.
- Use `AbortSignal` instead of `AbortController` as a more modern and simpler way to handle timeouts.
This commit is contained in:
@@ -1,64 +1,62 @@
|
||||
import { sleep } from '@/infrastructure/Threading/AsyncSleep';
|
||||
import { getUrlStatus, type IRequestOptions } from './Requestor';
|
||||
import { groupUrlsByDomain } from './UrlPerDomainGrouper';
|
||||
import type { IUrlStatus } from './IUrlStatus';
|
||||
import { getUrlStatus, type RequestOptions } from './Requestor';
|
||||
import { groupUrlsByDomain } from './UrlDomainProcessing';
|
||||
import type { FollowOptions } from './FetchFollow';
|
||||
import type { UrlStatus } from './UrlStatus';
|
||||
|
||||
export async function getUrlStatusesInParallel(
|
||||
urls: string[],
|
||||
options?: IBatchRequestOptions,
|
||||
): Promise<IUrlStatus[]> {
|
||||
// urls = [ 'https://privacy.sexy' ]; // Here to comment out when testing
|
||||
options?: BatchRequestOptions,
|
||||
): Promise<UrlStatus[]> {
|
||||
// urls = ['https://privacy.sexy']; // Uncomment this line to use a hardcoded URL for testing.
|
||||
const uniqueUrls = Array.from(new Set(urls));
|
||||
const defaultedOptions = { ...DefaultOptions, ...options };
|
||||
console.log('Options: ', defaultedOptions);
|
||||
const results = await request(uniqueUrls, defaultedOptions);
|
||||
const defaultedDomainOptions = { ...DefaultDomainOptions, ...options?.domainOptions };
|
||||
console.log('Batch request options applied:', defaultedDomainOptions);
|
||||
const results = await request(uniqueUrls, defaultedDomainOptions, options);
|
||||
return results;
|
||||
}
|
||||
|
||||
export interface IBatchRequestOptions {
|
||||
domainOptions?: IDomainOptions;
|
||||
requestOptions?: IRequestOptions;
|
||||
export interface BatchRequestOptions {
|
||||
readonly domainOptions?: Partial<DomainOptions>;
|
||||
readonly requestOptions?: Partial<RequestOptions>;
|
||||
readonly followOptions?: Partial<FollowOptions>;
|
||||
}
|
||||
|
||||
interface IDomainOptions {
|
||||
sameDomainParallelize?: boolean;
|
||||
sameDomainDelayInMs?: number;
|
||||
interface DomainOptions {
|
||||
readonly sameDomainParallelize?: boolean;
|
||||
readonly sameDomainDelayInMs?: number;
|
||||
}
|
||||
|
||||
const DefaultOptions: Required<IBatchRequestOptions> = {
|
||||
domainOptions: {
|
||||
sameDomainParallelize: false,
|
||||
sameDomainDelayInMs: 3 /* sec */ * 1000,
|
||||
},
|
||||
requestOptions: {
|
||||
retryExponentialBaseInMs: 5 /* sec */ * 1000,
|
||||
requestTimeoutInMs: 60 /* sec */ * 1000,
|
||||
additionalHeaders: {},
|
||||
},
|
||||
const DefaultDomainOptions: Required<DomainOptions> = {
|
||||
sameDomainParallelize: false,
|
||||
sameDomainDelayInMs: 3 /* sec */ * 1000,
|
||||
};
|
||||
|
||||
function request(
|
||||
urls: string[],
|
||||
options: Required<IBatchRequestOptions>,
|
||||
): Promise<IUrlStatus[]> {
|
||||
if (!options.domainOptions.sameDomainParallelize) {
|
||||
domainOptions: Required<DomainOptions>,
|
||||
options?: BatchRequestOptions,
|
||||
): Promise<UrlStatus[]> {
|
||||
if (!domainOptions.sameDomainParallelize) {
|
||||
return runOnEachDomainWithDelay(
|
||||
urls,
|
||||
(url) => getUrlStatus(url, options.requestOptions),
|
||||
options.domainOptions.sameDomainDelayInMs,
|
||||
(url) => getUrlStatus(url, options?.requestOptions, options?.followOptions),
|
||||
domainOptions.sameDomainDelayInMs,
|
||||
);
|
||||
}
|
||||
return Promise.all(urls.map((url) => getUrlStatus(url, options.requestOptions)));
|
||||
return Promise.all(
|
||||
urls.map((url) => getUrlStatus(url, options?.requestOptions, options?.followOptions)),
|
||||
);
|
||||
}
|
||||
|
||||
async function runOnEachDomainWithDelay(
|
||||
urls: string[],
|
||||
action: (url: string) => Promise<IUrlStatus>,
|
||||
action: (url: string) => Promise<UrlStatus>,
|
||||
delayInMs: number | undefined,
|
||||
): Promise<IUrlStatus[]> {
|
||||
): Promise<UrlStatus[]> {
|
||||
const grouped = groupUrlsByDomain(urls);
|
||||
const tasks = grouped.map(async (group) => {
|
||||
const results = new Array<IUrlStatus>();
|
||||
const results = new Array<UrlStatus>();
|
||||
/* eslint-disable no-await-in-loop */
|
||||
for (const url of group) {
|
||||
const status = await action(url);
|
||||
|
||||
Reference in New Issue
Block a user