Improve URL checks to reduce false-negatives

This commit improves the URL health checking mechanism to reduce false
negatives.

- Treat all 2XX status codes as successful, addressing issues with codes
  like `204`.
- Improve URL matching to exclude URLs within Markdown inline code block
  and support URLs containing parentheses.
- Add `forceHttpGetForUrlPatterns` to customize HTTP method per URL to
  allow verifying URLs behind CDN/WAFs that do not respond to HTTP HEAD.
- Send the Host header for improved handling of webpages behind proxies.
- Improve formatting and context for output messages.
- Fix the defaulting options for redirects and cookie handling.
- Update the user agent pool to modern browsers and platforms.
- Add support for randomizing TLS fingerprint to mimic various clients
  better, improving the effectiveness of checks. However, this is not
  fully supported by Node.js's HTTP client; see nodejs/undici#1983 for
  more details.
- Use `AbortSignal` instead of `AbortController` as more modern and
  simpler way to handle timeouts.
This commit is contained in:
undergroundwires
2024-03-13 11:34:19 +01:00
parent e7218850ba
commit 287b8e61a0
18 changed files with 363 additions and 222 deletions

View File

@@ -1,27 +1,33 @@
import { sleep } from '@/infrastructure/Threading/AsyncSleep';
import type { IUrlStatus } from './IUrlStatus';
import { indentText } from '@tests/shared/Text';
import { type UrlStatus, formatUrlStatus } from './UrlStatus';
const DefaultBaseRetryIntervalInMs = 5 /* sec */ * 1000;
export async function retryWithExponentialBackOff(
action: () => Promise<IUrlStatus>,
action: () => Promise<UrlStatus>,
baseRetryIntervalInMs: number = DefaultBaseRetryIntervalInMs,
currentRetry = 1,
): Promise<IUrlStatus> {
): Promise<UrlStatus> {
const maxTries = 3;
const status = await action();
if (shouldRetry(status)) {
if (currentRetry <= maxTries) {
const exponentialBackOffInMs = getRetryTimeoutInMs(currentRetry, baseRetryIntervalInMs);
console.log(`Retrying (${currentRetry}) in ${exponentialBackOffInMs / 1000} seconds`, status);
console.log([
`Attempt ${currentRetry}: Retrying in ${exponentialBackOffInMs / 1000} seconds.`,
'Details:',
indentText(formatUrlStatus(status)),
].join('\n'));
await sleep(exponentialBackOffInMs);
return retryWithExponentialBackOff(action, baseRetryIntervalInMs, currentRetry + 1);
}
console.warn('💀 All retry attempts failed. Final failure to retrieve URL:', indentText(formatUrlStatus(status)));
}
return status;
}
function shouldRetry(status: IUrlStatus) {
function shouldRetry(status: UrlStatus): boolean {
if (status.error) {
return true;
}
@@ -32,14 +38,14 @@ function shouldRetry(status: IUrlStatus) {
|| status.code === 429; // Too Many Requests
}
function isTransientError(statusCode: number) {
function isTransientError(statusCode: number): boolean {
return statusCode >= 500 && statusCode <= 599;
}
function getRetryTimeoutInMs(
currentRetry: number,
baseRetryIntervalInMs: number = DefaultBaseRetryIntervalInMs,
) {
): number {
const retryRandomFactor = 0.5; // Retry intervals are between 50% and 150%
// of the exponentially increasing base amount
const minRandom = 1 - retryRandomFactor;