Files
privacy.sexy/tests/checks/external-urls/StatusChecker/BatchStatusChecker.ts
undergroundwires 287b8e61a0 Improve URL checks to reduce false-negatives
This commit improves the URL health checking mechanism to reduce false
negatives.

- Treat all 2XX status codes as successful, addressing issues with codes
  like `204`.
- Improve URL matching to exclude URLs within Markdown inline code block
  and support URLs containing parentheses.
- Add `forceHttpGetForUrlPatterns` to customize HTTP method per URL to
  allow verifying URLs behind CDN/WAFs that do not respond to HTTP HEAD.
- Send the Host header for improved handling of webpages behind proxies.
- Improve formatting and context for output messages.
- Fix the defaulting options for redirects and cookie handling.
- Update the user agent pool to modern browsers and platforms.
- Add support for randomizing TLS fingerprint to mimic various clients
  better, improving the effectiveness of checks. However, this is not
  fully supported by Node.js's HTTP client; see nodejs/undici#1983 for
  more details.
- Use `AbortSignal` instead of `AbortController` as more modern and
  simpler way to handle timeouts.
2024-03-15 13:42:29 +01:00

79 lines
2.4 KiB
TypeScript

import { sleep } from '@/infrastructure/Threading/AsyncSleep';
import { getUrlStatus, type RequestOptions } from './Requestor';
import { groupUrlsByDomain } from './UrlDomainProcessing';
import type { FollowOptions } from './FetchFollow';
import type { UrlStatus } from './UrlStatus';
export async function getUrlStatusesInParallel(
urls: string[],
options?: BatchRequestOptions,
): Promise<UrlStatus[]> {
// urls = ['https://privacy.sexy']; // Comment out this line to use a hardcoded URL for testing.
const uniqueUrls = Array.from(new Set(urls));
const defaultedDomainOptions: Required<DomainOptions> = {
...DefaultDomainOptions,
...options?.domainOptions,
};
console.log('Batch request options applied:', defaultedDomainOptions);
const results = await request(uniqueUrls, defaultedDomainOptions, options);
return results;
}
export interface BatchRequestOptions {
readonly domainOptions?: Partial<DomainOptions>;
readonly requestOptions?: Partial<RequestOptions>;
readonly followOptions?: Partial<FollowOptions>;
}
interface DomainOptions {
readonly sameDomainParallelize?: boolean;
readonly sameDomainDelayInMs?: number;
}
const DefaultDomainOptions: Required<DomainOptions> = {
sameDomainParallelize: false,
sameDomainDelayInMs: 3 /* sec */ * 1000,
};
function request(
urls: string[],
domainOptions: Required<DomainOptions>,
options?: BatchRequestOptions,
): Promise<UrlStatus[]> {
if (!domainOptions.sameDomainParallelize) {
return runOnEachDomainWithDelay(
urls,
(url) => getUrlStatus(url, options?.requestOptions, options?.followOptions),
domainOptions.sameDomainDelayInMs,
);
}
return Promise.all(
urls.map((url) => getUrlStatus(url, options?.requestOptions, options?.followOptions)),
);
}
async function runOnEachDomainWithDelay(
urls: string[],
action: (url: string) => Promise<UrlStatus>,
delayInMs: number | undefined,
): Promise<UrlStatus[]> {
const grouped = groupUrlsByDomain(urls);
const tasks = grouped.map(async (group) => {
const results = new Array<UrlStatus>();
/* eslint-disable no-await-in-loop */
for (const url of group) {
const status = await action(url);
results.push(status);
if (results.length !== group.length) {
if (delayInMs !== undefined) {
await sleep(delayInMs);
}
}
}
/* eslint-enable no-await-in-loop */
return results;
});
const r = await Promise.all(tasks);
return r.flat();
}