Files
undergroundwires be7a886225 Improve URL checks to reduce false-negatives
This commit improves the URL health checking mechanism to reduce false
negatives.

- Treat all 2XX status codes as successful, addressing issues with codes
  like `204`.
- Exclude URLs within Markdown inline code blocks.
- Send the Host header for improved handling of webpages behind proxies.
- Improve formatting and context for output messages.
- Fix the defaulting options for redirects and cookie handling.
- Add URL exclusion support for non-responsive URLs.
- Update the user agent pool to modern browsers and platforms.
- Improve CI/CD workflow to respond to modifications in the
  `test/checks/external-urls` directory, offering immediate feedback on
  potential impacts to the external URL test.
- Add support for randomizing TLS fingerprint to mimic various clients
  better, improving the effectiveness of checks. However, this is not
  fully supported by Node.js's HTTP client; see nodejs/undici#1983 for
  more details.
- Use `AbortSignal` instead of `AbortController` as more modern and
  simpler way to handle timeouts.
2024-03-13 18:26:16 +01:00

76 lines
2.4 KiB
TypeScript

import { sleep } from '@/infrastructure/Threading/AsyncSleep';
import { getUrlStatus, type RequestOptions } from './Requestor';
import { groupUrlsByDomain } from './UrlDomainProcessing';
import type { FollowOptions } from './FetchFollow';
import type { UrlStatus } from './UrlStatus';
export async function getUrlStatusesInParallel(
urls: string[],
options?: BatchRequestOptions,
): Promise<UrlStatus[]> {
// urls = ['https://privacy.sexy']; // Comment out this line to use a hardcoded URL for testing.
const uniqueUrls = Array.from(new Set(urls));
const defaultedDomainOptions = { ...DefaultDomainOptions, ...options?.domainOptions };
console.log('Batch request options applied:', defaultedDomainOptions);
const results = await request(uniqueUrls, defaultedDomainOptions, options);
return results;
}
export interface BatchRequestOptions {
readonly domainOptions?: Partial<DomainOptions>;
readonly requestOptions?: Partial<RequestOptions>;
readonly followOptions?: Partial<FollowOptions>;
}
interface DomainOptions {
readonly sameDomainParallelize?: boolean;
readonly sameDomainDelayInMs?: number;
}
const DefaultDomainOptions: Required<DomainOptions> = {
sameDomainParallelize: false,
sameDomainDelayInMs: 3 /* sec */ * 1000,
};
function request(
urls: string[],
domainOptions: Required<DomainOptions>,
options?: BatchRequestOptions,
): Promise<UrlStatus[]> {
if (!domainOptions.sameDomainParallelize) {
return runOnEachDomainWithDelay(
urls,
(url) => getUrlStatus(url, options?.requestOptions, options?.followOptions),
domainOptions.sameDomainDelayInMs,
);
}
return Promise.all(
urls.map((url) => getUrlStatus(url, options?.requestOptions, options?.followOptions)),
);
}
async function runOnEachDomainWithDelay(
urls: string[],
action: (url: string) => Promise<UrlStatus>,
delayInMs: number | undefined,
): Promise<UrlStatus[]> {
const grouped = groupUrlsByDomain(urls);
const tasks = grouped.map(async (group) => {
const results = new Array<UrlStatus>();
/* eslint-disable no-await-in-loop */
for (const url of group) {
const status = await action(url);
results.push(status);
if (results.length !== group.length) {
if (delayInMs !== undefined) {
await sleep(delayInMs);
}
}
}
/* eslint-enable no-await-in-loop */
return results;
});
const r = await Promise.all(tasks);
return r.flat();
}