Improve URL checks to reduce false-negatives

This commit improves the URL health checking mechanism to reduce false
negatives.

- Treat all 2XX status codes as successful, addressing issues with codes
  like `204`.
- Improve URL matching to exclude URLs within Markdown inline code block
  and support URLs containing parentheses.
- Add `forceHttpGetForUrlPatterns` to customize HTTP method per URL to
  allow verifying URLs behind CDN/WAFs that do not respond to HTTP HEAD.
- Send the Host header for improved handling of webpages behind proxies.
- Improve formatting and context for output messages.
- Fix the defaulting options for redirects and cookie handling.
- Update the user agent pool to modern browsers and platforms.
- Add support for randomizing TLS fingerprint to mimic various clients
  better, improving the effectiveness of checks. However, this is not
  fully supported by Node.js's HTTP client; see nodejs/undici#1983 for
  more details.
- Use `AbortSignal` instead of `AbortController` as more modern and
  simpler way to handle timeouts.
This commit is contained in:
undergroundwires
2024-03-13 11:34:19 +01:00
parent e7218850ba
commit 287b8e61a0
18 changed files with 363 additions and 222 deletions

View File

@@ -1,19 +1,22 @@
import { indentText } from '@tests/shared/Text';
import { fetchWithTimeout } from './FetchWithTimeout';
import { getDomainFromUrl } from './UrlDomainProcessing';
export function fetchFollow(
url: string,
timeoutInMs: number,
fetchOptions: RequestInit,
followOptions: IFollowOptions | undefined,
fetchOptions?: Partial<RequestInit>,
followOptions?: Partial<FollowOptions>,
): Promise<Response> {
const defaultedFollowOptions = {
const defaultedFollowOptions: Required<FollowOptions> = {
...DefaultFollowOptions,
...followOptions,
};
if (followRedirects(defaultedFollowOptions)) {
console.log(indentText(`Follow options: ${JSON.stringify(defaultedFollowOptions)}`));
if (!followRedirects(defaultedFollowOptions)) {
return fetchWithTimeout(url, timeoutInMs, fetchOptions);
}
fetchOptions = { ...fetchOptions, redirect: 'manual' /* handled manually */ };
fetchOptions = { ...fetchOptions, redirect: 'manual' /* handled manually */, mode: 'cors' };
const cookies = new CookieStorage(defaultedFollowOptions.enableCookies);
return followRecursivelyWithCookies(
url,
@@ -24,13 +27,13 @@ export function fetchFollow(
);
}
export interface IFollowOptions {
followRedirects?: boolean;
maximumRedirectFollowDepth?: number;
enableCookies?: boolean;
export interface FollowOptions {
readonly followRedirects?: boolean;
readonly maximumRedirectFollowDepth?: number;
readonly enableCookies?: boolean;
}
export const DefaultFollowOptions: Required<IFollowOptions> = {
const DefaultFollowOptions: Required<FollowOptions> = {
followRedirects: true,
maximumRedirectFollowDepth: 20,
enableCookies: true,
@@ -64,6 +67,10 @@ async function followRecursivelyWithCookies(
if (cookieHeader) {
cookies.addHeader(cookieHeader);
}
options.headers = {
...options.headers,
Host: getDomainFromUrl(nextUrl),
};
return followRecursivelyWithCookies(nextUrl, timeoutInMs, options, newFollowDepth, cookies);
}
@@ -77,7 +84,7 @@ class CookieStorage {
constructor(private readonly enabled: boolean) {
}
public hasAny() {
public hasAny(): boolean {
return this.enabled && this.cookies.length > 0;
}
@@ -88,17 +95,17 @@ class CookieStorage {
this.cookies.push(header);
}
public getHeader() {
public getHeader(): string {
return this.cookies.join(' ; ');
}
}
function followRedirects(options: IFollowOptions) {
if (!options.followRedirects) {
function followRedirects(options: FollowOptions): boolean {
if (options.followRedirects !== true) {
return false;
}
if (options.maximumRedirectFollowDepth === 0) {
return false;
if (options.maximumRedirectFollowDepth === undefined || options.maximumRedirectFollowDepth <= 0) {
throw new Error('Invalid followRedirects configuration: maximumRedirectFollowDepth must be a positive integer');
}
return true;
}