Improve URL checks to reduce false-negatives

This commit improves the URL health checking mechanism to reduce false
negatives.

- Treat all 2XX status codes as successful, addressing issues with codes
  like `204`.
- Exclude URLs within Markdown inline code blocks.
- Send the Host header for improved handling of webpages behind proxies.
- Improve formatting and context for output messages.
- Fix the defaulting of options for redirects and cookie handling.
- Add URL exclusion support for non-responsive URLs.
- Update the user agent pool to modern browsers and platforms.
- Improve CI/CD workflow to respond to modifications in the
  `tests/checks/external-urls` directory, offering immediate feedback on
  potential impacts to the external URL test.
- Add support for randomizing the TLS fingerprint to mimic various clients
  better, improving the effectiveness of checks. However, this is not
  fully supported by Node.js's HTTP client; see nodejs/undici#1983 for
  more details.
- Use `AbortSignal` instead of `AbortController` as a more modern and
  simpler way to handle timeouts.
This commit is contained in:
undergroundwires
2024-03-13 11:34:19 +01:00
parent 4ac1425f76
commit be7a886225
19 changed files with 335 additions and 221 deletions

View File

@@ -3,6 +3,9 @@ name: checks.external-urls
on:
schedule:
- cron: '0 0 * * 0' # at 00:00 on every Sunday
push:
paths:
- tests/checks/external-urls/**
jobs:
run-check:

View File

@@ -1,7 +1,10 @@
export type SchedulerCallbackType = (...args: unknown[]) => void;
export type SchedulerType = (callback: SchedulerCallbackType, ms: number) => void;
export function sleep(time: number, scheduler: SchedulerType = setTimeout) {
export function sleep(
time: number,
scheduler: SchedulerType = setTimeout,
): Promise<void> {
return new Promise((resolve) => {
scheduler(() => resolve(undefined), time);
});

View File

@@ -1,4 +1,4 @@
import { splitTextIntoLines, indentText } from '../utils/text';
import { indentText, splitTextIntoLines } from '@tests/shared/Text';
import { log, die } from '../utils/log';
import { readAppLogFile } from './app-logs';
import { STDERR_IGNORE_PATTERNS } from './error-ignore-patterns';

View File

@@ -1,7 +1,7 @@
import { filterEmpty } from '@tests/shared/Text';
import { runCommand } from '../../utils/run-command';
import { log, LogLevel } from '../../utils/log';
import { SupportedPlatform, CURRENT_PLATFORM } from '../../utils/platform';
import { filterEmpty } from '../../utils/text';
export async function captureWindowTitles(processId: number) {
if (!processId) { throw new Error('Missing process ID.'); }

View File

@@ -1,3 +1,4 @@
import { indentText } from '@tests/shared/Text';
import { logCurrentArgs, CommandLineFlag, hasCommandLineFlag } from './cli-args';
import { log, die } from './utils/log';
import { ensureNpmProjectDir, npmInstall, npmBuild } from './utils/npm';
@@ -15,7 +16,6 @@ import {
APP_EXECUTION_DURATION_IN_SECONDS,
SCREENSHOT_PATH,
} from './config';
import { indentText } from './utils/text';
import type { ExtractionResult } from './app/extractors/common/extraction-result';
export async function main(): Promise<void> {

View File

@@ -1,5 +1,6 @@
import { exec, type ExecOptions, type ExecException } from 'node:child_process';
import { indentText } from './text';
import { exec } from 'child_process';
import { indentText } from '@tests/shared/Text';
import type { ExecOptions, ExecException } from 'child_process';
const TIMEOUT_IN_SECONDS = 180;
const MAX_OUTPUT_BUFFER_SIZE = 1024 * 1024; // 1 MB

View File

@@ -1,64 +1,62 @@
import { sleep } from '@/infrastructure/Threading/AsyncSleep';
import { getUrlStatus, type IRequestOptions } from './Requestor';
import { groupUrlsByDomain } from './UrlPerDomainGrouper';
import type { IUrlStatus } from './IUrlStatus';
import { getUrlStatus, type RequestOptions } from './Requestor';
import { groupUrlsByDomain } from './UrlDomainProcessing';
import type { FollowOptions } from './FetchFollow';
import type { UrlStatus } from './UrlStatus';
export async function getUrlStatusesInParallel(
urls: string[],
options?: IBatchRequestOptions,
): Promise<IUrlStatus[]> {
// urls = [ 'https://privacy.sexy' ]; // Here to comment out when testing
options?: BatchRequestOptions,
): Promise<UrlStatus[]> {
// urls = ['https://privacy.sexy']; // Uncomment this line to use a hardcoded URL for testing.
const uniqueUrls = Array.from(new Set(urls));
const defaultedOptions = { ...DefaultOptions, ...options };
console.log('Options: ', defaultedOptions);
const results = await request(uniqueUrls, defaultedOptions);
const defaultedDomainOptions = { ...DefaultDomainOptions, ...options?.domainOptions };
console.log('Batch request options applied:', defaultedDomainOptions);
const results = await request(uniqueUrls, defaultedDomainOptions, options);
return results;
}
export interface IBatchRequestOptions {
domainOptions?: IDomainOptions;
requestOptions?: IRequestOptions;
export interface BatchRequestOptions {
readonly domainOptions?: Partial<DomainOptions>;
readonly requestOptions?: Partial<RequestOptions>;
readonly followOptions?: Partial<FollowOptions>;
}
interface IDomainOptions {
sameDomainParallelize?: boolean;
sameDomainDelayInMs?: number;
interface DomainOptions {
readonly sameDomainParallelize?: boolean;
readonly sameDomainDelayInMs?: number;
}
const DefaultOptions: Required<IBatchRequestOptions> = {
domainOptions: {
sameDomainParallelize: false,
sameDomainDelayInMs: 3 /* sec */ * 1000,
},
requestOptions: {
retryExponentialBaseInMs: 5 /* sec */ * 1000,
requestTimeoutInMs: 60 /* sec */ * 1000,
additionalHeaders: {},
},
const DefaultDomainOptions: Required<DomainOptions> = {
sameDomainParallelize: false,
sameDomainDelayInMs: 3 /* sec */ * 1000,
};
function request(
urls: string[],
options: Required<IBatchRequestOptions>,
): Promise<IUrlStatus[]> {
if (!options.domainOptions.sameDomainParallelize) {
domainOptions: Required<DomainOptions>,
options?: BatchRequestOptions,
): Promise<UrlStatus[]> {
if (!domainOptions.sameDomainParallelize) {
return runOnEachDomainWithDelay(
urls,
(url) => getUrlStatus(url, options.requestOptions),
options.domainOptions.sameDomainDelayInMs,
(url) => getUrlStatus(url, options?.requestOptions, options?.followOptions),
domainOptions.sameDomainDelayInMs,
);
}
return Promise.all(urls.map((url) => getUrlStatus(url, options.requestOptions)));
return Promise.all(
urls.map((url) => getUrlStatus(url, options?.requestOptions, options?.followOptions)),
);
}
async function runOnEachDomainWithDelay(
urls: string[],
action: (url: string) => Promise<IUrlStatus>,
action: (url: string) => Promise<UrlStatus>,
delayInMs: number | undefined,
): Promise<IUrlStatus[]> {
): Promise<UrlStatus[]> {
const grouped = groupUrlsByDomain(urls);
const tasks = grouped.map(async (group) => {
const results = new Array<IUrlStatus>();
const results = new Array<UrlStatus>();
/* eslint-disable no-await-in-loop */
for (const url of group) {
const status = await action(url);

View File

@@ -1,27 +1,33 @@
import { sleep } from '@/infrastructure/Threading/AsyncSleep';
import type { IUrlStatus } from './IUrlStatus';
import { indentText } from '@tests/shared/Text';
import { type UrlStatus, formatUrlStatus } from './UrlStatus';
const DefaultBaseRetryIntervalInMs = 5 /* sec */ * 1000;
export async function retryWithExponentialBackOff(
action: () => Promise<IUrlStatus>,
action: () => Promise<UrlStatus>,
baseRetryIntervalInMs: number = DefaultBaseRetryIntervalInMs,
currentRetry = 1,
): Promise<IUrlStatus> {
): Promise<UrlStatus> {
const maxTries = 3;
const status = await action();
if (shouldRetry(status)) {
if (currentRetry <= maxTries) {
const exponentialBackOffInMs = getRetryTimeoutInMs(currentRetry, baseRetryIntervalInMs);
console.log(`Retrying (${currentRetry}) in ${exponentialBackOffInMs / 1000} seconds`, status);
console.log([
`Attempt ${currentRetry}: Retrying in ${exponentialBackOffInMs / 1000} seconds.`,
'Details:',
indentText(formatUrlStatus(status)),
].join('\n'));
await sleep(exponentialBackOffInMs);
return retryWithExponentialBackOff(action, baseRetryIntervalInMs, currentRetry + 1);
}
console.warn('💀 All retry attempts failed. Final failure to retrieve URL:', indentText(formatUrlStatus(status)));
}
return status;
}
function shouldRetry(status: IUrlStatus) {
function shouldRetry(status: UrlStatus): boolean {
if (status.error) {
return true;
}
@@ -32,14 +38,14 @@ function shouldRetry(status: IUrlStatus) {
|| status.code === 429; // Too Many Requests
}
function isTransientError(statusCode: number) {
/**
 * Tells whether an HTTP status code falls in the 5XX range (500 to 599 inclusive).
 *
 * @param statusCode - HTTP response status code to classify.
 * @returns `true` for codes from 500 through 599, otherwise `false`.
 */
function isTransientError(statusCode: number): boolean {
  const firstServerErrorCode = 500;
  const lastServerErrorCode = 599;
  if (statusCode < firstServerErrorCode) {
    return false;
  }
  return statusCode <= lastServerErrorCode;
}
function getRetryTimeoutInMs(
currentRetry: number,
baseRetryIntervalInMs: number = DefaultBaseRetryIntervalInMs,
) {
): number {
const retryRandomFactor = 0.5; // Retry intervals are between 50% and 150%
// of the exponentially increasing base amount
const minRandom = 1 - retryRandomFactor;

View File

@@ -1,19 +1,17 @@
import { fetchWithTimeout } from './FetchWithTimeout';
import { getDomainFromUrl } from './UrlDomainProcessing';
export function fetchFollow(
url: string,
timeoutInMs: number,
fetchOptions: RequestInit,
followOptions: IFollowOptions | undefined,
fetchOptions?: Partial<RequestInit>,
followOptions?: Partial<FollowOptions>,
): Promise<Response> {
const defaultedFollowOptions = {
...DefaultFollowOptions,
...followOptions,
};
const defaultedFollowOptions = { ...DefaultFollowOptions, ...followOptions };
if (followRedirects(defaultedFollowOptions)) {
return fetchWithTimeout(url, timeoutInMs, fetchOptions);
}
fetchOptions = { ...fetchOptions, redirect: 'manual' /* handled manually */ };
fetchOptions = { ...fetchOptions, redirect: 'manual' /* handled manually */, mode: 'cors' };
const cookies = new CookieStorage(defaultedFollowOptions.enableCookies);
return followRecursivelyWithCookies(
url,
@@ -24,13 +22,15 @@ export function fetchFollow(
);
}
export interface IFollowOptions {
followRedirects?: boolean;
maximumRedirectFollowDepth?: number;
enableCookies?: boolean;
// "cors" | "navigate" | "no-cors" | "same-origin";
export interface FollowOptions {
readonly followRedirects?: boolean;
readonly maximumRedirectFollowDepth?: number;
readonly enableCookies?: boolean;
}
export const DefaultFollowOptions: Required<IFollowOptions> = {
const DefaultFollowOptions: Required<FollowOptions> = {
followRedirects: true,
maximumRedirectFollowDepth: 20,
enableCookies: true,
@@ -64,6 +64,10 @@ async function followRecursivelyWithCookies(
if (cookieHeader) {
cookies.addHeader(cookieHeader);
}
options.headers = {
...options.headers,
Host: getDomainFromUrl(nextUrl),
};
return followRecursivelyWithCookies(nextUrl, timeoutInMs, options, newFollowDepth, cookies);
}
@@ -77,7 +81,7 @@ class CookieStorage {
constructor(private readonly enabled: boolean) {
}
public hasAny() {
public hasAny(): boolean {
return this.enabled && this.cookies.length > 0;
}
@@ -88,12 +92,12 @@ class CookieStorage {
this.cookies.push(header);
}
public getHeader() {
public getHeader(): string {
return this.cookies.join(' ; ');
}
}
function followRedirects(options: IFollowOptions) {
function followRedirects(options: FollowOptions): boolean {
if (!options.followRedirects) {
return false;
}

View File

@@ -2,13 +2,13 @@ export async function fetchWithTimeout(
url: string,
timeoutInMs: number,
init?: RequestInit,
): Promise<Response> {
const controller = new AbortController();
): ReturnType<typeof fetch> {
const options: RequestInit = {
...(init ?? {}),
signal: controller.signal,
signal: AbortSignal.timeout(timeoutInMs),
};
const promise = fetch(url, options);
const timeout = setTimeout(() => controller.abort(), timeoutInMs);
return promise.finally(() => clearTimeout(timeout));
return fetch(
url,
options,
);
}

View File

@@ -1,5 +0,0 @@
export interface IUrlStatus {
url: string;
error?: string;
code?: number;
}

View File

@@ -13,7 +13,10 @@ A CLI and SDK for checking the availability of external URLs.
- 😇 **Rate Limiting**: Queues requests by domain to be polite.
- 🔁 **Retries**: Implements retry pattern with exponential back-off.
- ⏳ **Timeouts**: Configurable timeout for each request.
- 🎭️ **User-Agent Rotation**: Change user agents for each request.
- 🎭️ **Impersonation**: Impersonate different browsers for each request.
- **🌐 User-Agent Rotation**: Change user agents.
- **🔑 TLS Handshakes**: Perform TLS and HTTP handshakes that resemble those of a real browser.
- 🫙 **Cookie jar**: Preserve cookies during redirects to mimic a real browser.
## CLI
@@ -54,6 +57,7 @@ const statuses = await getUrlStatusesInParallel([ 'https://privacy.sexy', /* ...
- **`sameDomainDelayInMs`** (*number*), default: `3000` (3 seconds)
- Sets the delay between requests to the same domain.
- `requestOptions` (*object*): See [request options](#request-options).
- `followOptions` (*object*): See [follow options](#follow-options).
### `getUrlStatus`
@@ -72,7 +76,6 @@ console.log(`Status code: ${status.code}`);
- The longer the base time, the greater the intervals between retries.
- **`additionalHeaders`** (*object*), default: `{}`
- Additional HTTP headers to send along with the default headers. Overrides default headers if specified.
- **`followOptions`** (*object*): See [follow options](#follow-options).
- **`requestTimeoutInMs`** (*number*), default: `60000` (60 seconds)
- Time limit to abort the request if no response is received within the specified time frame.
@@ -83,19 +86,7 @@ Follows `3XX` redirects while preserving cookies.
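Similar to the fetch API, except that the second parameter is the request timeout in milliseconds, the third is the fetch options, and the last specifies [follow options](#follow-options); fetch's `redirect: 'follow' | 'manual' | 'error'` option is discarded in favor of the follow options.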
```js
const status = await fetchFollow('https://privacy.sexy', {
// First argument is same options as fetch API, except `redirect` options
// that's discarded in favor of next argument follow options
headers: {
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'
},
}, {
// Second argument sets the redirect behavior
followRedirects: true,
maximumRedirectFollowDepth: 20,
enableCookies: true,
}
);
const response = await fetchFollow('https://privacy.sexy', 1000 /* timeout in milliseconds */);
console.log(`Status code: ${response.status}`);
```

View File

@@ -1,70 +1,108 @@
import { indentText } from '@tests/shared/Text';
import { retryWithExponentialBackOff } from './ExponentialBackOffRetryHandler';
import { fetchFollow, type IFollowOptions, DefaultFollowOptions } from './FetchFollow';
import { fetchFollow, type FollowOptions } from './FetchFollow';
import { getRandomUserAgent } from './UserAgents';
import type { IUrlStatus } from './IUrlStatus';
import { getDomainFromUrl } from './UrlDomainProcessing';
import { randomizeTlsFingerprint, getTlsContextInfo } from './TlsFingerprintRandomizer';
import type { UrlStatus } from './UrlStatus';
export function getUrlStatus(
url: string,
options: IRequestOptions = DefaultOptions,
): Promise<IUrlStatus> {
const defaultedOptions = { ...DefaultOptions, ...options };
const fetchOptions = getFetchOptions(url, defaultedOptions);
return retryWithExponentialBackOff(async () => {
console.log('Requesting', url);
let result: IUrlStatus;
try {
const response = await fetchFollow(
url,
defaultedOptions.requestTimeoutInMs,
fetchOptions,
defaultedOptions.followOptions,
);
result = { url, code: response.status };
} catch (err) {
result = { url, error: JSON.stringify(err, null, '\t') };
}
return result;
}, defaultedOptions.retryExponentialBaseInMs);
requestOptions?: Partial<RequestOptions>,
followOptions?: Partial<FollowOptions>,
): Promise<UrlStatus> {
const defaultedOptions = getDefaultedRequestOptions(requestOptions);
if (defaultedOptions.randomizeTlsFingerprint) {
randomizeTlsFingerprint();
}
return fetchUrlStatusWithRetry(url, defaultedOptions, followOptions);
}
export interface IRequestOptions {
export interface RequestOptions {
readonly retryExponentialBaseInMs?: number;
readonly additionalHeaders?: Record<string, string>;
readonly additionalHeadersUrlIgnore?: string[];
readonly followOptions?: IFollowOptions;
readonly requestTimeoutInMs: number;
readonly randomizeTlsFingerprint: boolean;
}
const DefaultOptions: Required<IRequestOptions> = {
retryExponentialBaseInMs: 5000,
const DefaultOptions: Required<RequestOptions> = {
retryExponentialBaseInMs: 5 /* sec */ * 1000,
additionalHeaders: {},
additionalHeadersUrlIgnore: [],
requestTimeoutInMs: 60 /* seconds */ * 1000,
followOptions: DefaultFollowOptions,
randomizeTlsFingerprint: true,
};
function getFetchOptions(url: string, options: Required<IRequestOptions>): RequestInit {
/**
 * Requests the given URL, retrying through `retryWithExponentialBackOff`, and reports
 * either the response status code or a formatted error description.
 *
 * @param url - URL to request.
 * @param requestOptions - Fully defaulted request options (timeout, retry base, headers).
 * @param followOptions - Optional redirect/cookie options forwarded to `fetchFollow`.
 * @returns The URL together with its response code, or with an error report when the
 *          request throws.
 */
function fetchUrlStatusWithRetry(
  url: string,
  requestOptions: Required<RequestOptions>,
  followOptions?: Partial<FollowOptions>,
): Promise<UrlStatus> {
  const fetchOptions = getFetchOptions(url, requestOptions);
  return retryWithExponentialBackOff(async (): Promise<UrlStatus> => {
    console.log(`Initiating request for URL: ${url}`);
    try {
      const response = await fetchFollow(
        url,
        requestOptions.requestTimeoutInMs,
        fetchOptions,
        followOptions,
      );
      return { url, code: response.status };
    } catch (err) {
      return {
        url,
        error: [
          'Error:', indentText(describeRequestError(err)),
          'Options:', indentText(JSON.stringify(fetchOptions, null, '\t')),
          'TLS:', indentText(getTlsContextInfo()),
        ].join('\n'),
      };
    }
  }, requestOptions.retryExponentialBaseInMs);
}

/**
 * Converts a thrown value into readable text.
 *
 * `JSON.stringify` only serializes own enumerable properties, so an `Error` instance
 * serializes to `"{}"` and its name, message and cause would be lost. Errors are
 * therefore formatted explicitly, following the `cause` chain up to a small depth.
 */
function describeRequestError(err: unknown, remainingCauseDepth = 5): string {
  if (err instanceof Error) {
    const summary = `${err.name}: ${err.message}`;
    const cause: unknown = 'cause' in err ? err.cause : undefined;
    if (cause === undefined || cause === null || remainingCauseDepth <= 0) {
      return summary;
    }
    return `${summary}\nCaused by: ${describeRequestError(cause, remainingCauseDepth - 1)}`;
  }
  // Non-Error values (plain objects, strings, ...) keep the JSON representation;
  // `JSON.stringify` returns `undefined` for e.g. `undefined`, hence the fallback.
  const serialized: string | undefined = JSON.stringify(err, null, '\t');
  return serialized || String(err);
}
function getFetchOptions(url: string, options: Required<RequestOptions>): RequestInit {
const additionalHeaders = options.additionalHeadersUrlIgnore
.some((ignorePattern) => url.startsWith(ignorePattern))
? {}
: options.additionalHeaders;
return {
method: 'HEAD',
method: 'GET', // Full GET request; unlike HEAD, the server also sends the response body
headers: {
...getDefaultHeaders(),
...getDefaultHeaders(url),
...additionalHeaders,
},
redirect: 'manual', // Redirects are handled manually, automatic redirects do not work with Host header
};
}
function getDefaultHeaders(): Record<string, string> {
function getDefaultHeaders(url: string): Record<string, string> {
return {
'user-agent': getRandomUserAgent(),
'upgrade-insecure-requests': '1',
connection: 'keep-alive',
accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, br',
'cache-control': 'max-age=0',
'accept-language': 'en-US,en;q=0.9',
// Needed for websites that filter out non-browser user agents.
'User-Agent': getRandomUserAgent(),
// Required for some websites, especially those behind proxies, to correctly handle the request.
Host: getDomainFromUrl(url),
// The following mimic a real browser request to improve compatibility with most web servers.
'Upgrade-Insecure-Requests': '1',
Connection: 'keep-alive',
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Cache-Control': 'max-age=0',
'Accept-Language': 'en-US,en;q=0.9',
};
}
/**
 * Builds the effective request options by layering the caller's values over
 * `DefaultOptions`.
 *
 * @param options - Optional overrides; any key present here replaces the default.
 * @returns Request options with every field populated.
 */
function getDefaultedRequestOptions(
  options?: Partial<RequestOptions>,
): Required<RequestOptions> {
  const overrides: Partial<RequestOptions> = options ?? {};
  const merged: Required<RequestOptions> = { ...DefaultOptions, ...overrides };
  return merged;
}

View File

@@ -0,0 +1,69 @@
/**
* Modifies the TLS fingerprint of Node.js HTTP client to circumvent TLS fingerprinting blocks.
* TLS fingerprinting is a technique used to identify clients based on the unencrypted data sent
* during the TLS handshake, used for blocking or identifying non-browser clients like debugging
* proxies or automated scripts.
*
* However, Node.js's HTTP client does not fully support all methods required for impersonating a
* browser's TLS fingerprint, as reported in https://github.com/nodejs/undici/issues/1983.
* While this implementation can alter the TLS fingerprint by randomizing the cipher suite order,
* it may not perfectly mimic specific browser fingerprints due to limitations in the TLS
* implementation of Node.js.
*
* For more detailed information, visit:
* - https://archive.today/2024.03.13-102042/https://httptoolkit.com/blog/tls-fingerprinting-node-js/
* - https://check.ja3.zone/ (To check your tool's or browser's fingerprint)
* - https://github.com/lwthiker/curl-impersonate (A solution for curl)
* - https://github.com/depicts/got-tls (Cipher manipulation support for Node.js)
*/
import { constants } from 'crypto';
import tls from 'tls';
import { indentText } from '@tests/shared/Text';
/**
 * Replaces `tls.DEFAULT_CIPHERS` with a shuffled cipher ordering and logs both the
 * default cipher list and the resulting TLS context.
 */
export function randomizeTlsFingerprint() {
  const cipherOrder = getShuffledCiphers();
  tls.DEFAULT_CIPHERS = cipherOrder.join(':');
  const report = [
    'Original ciphers:',
    indentText(constants.defaultCipherList),
    'Current context',
    indentText(getTlsContextInfo()),
  ];
  console.log(report.join('\n'));
}
/**
 * Describes the TLS defaults currently in effect as three lines of text: the cipher
 * list, the minimum protocol version, and whether the cipher list still equals
 * Node.js's core default ("Visible") or has been changed ("Masked").
 *
 * @returns Newline-separated, human-readable summary.
 */
export function getTlsContextInfo(): string {
  const activeCiphers = tls.DEFAULT_CIPHERS;
  const usesNodeDefaults = constants.defaultCoreCipherList === activeCiphers;
  const fingerprintVisibility = usesNodeDefaults ? 'Visible' : 'Masked';
  const summaryLines = [
    `Ciphers: ${activeCiphers}`,
    `Minimum TLS protocol version: ${tls.DEFAULT_MIN_VERSION}`,
    `Node fingerprint: ${fingerprintVisibility}`,
  ];
  return summaryLines.join('\n');
}
/**
 * Returns Node.js's core default cipher list in a randomized order, leaving the first
 * three ciphers where they are so the most preferred ones keep their priority.
 *
 * Changing the cipher order alters the data sent during the TLS handshake, which yields
 * a fingerprint that differs from the static one of a default Node.js client.
 *
 * For more details, refer to:
 * - https://archive.today/2024.03.13-102234/https://getsetfetch.org/blog/tls-fingerprint.html
 *
 * @returns The three leading ciphers in their default order followed by the remaining
 *          ciphers in random order.
 */
export function getShuffledCiphers(): readonly string[] {
  const pinnedCipherCount = 3;
  const defaultOrder = constants.defaultCoreCipherList.split(':');
  // The head of the list stays untouched; only the tail is shuffled.
  const pinnedCiphers = defaultOrder.slice(0, pinnedCipherCount);
  // Tag every remaining cipher with a random key, then order by that key.
  const keyedCiphers: Array<{ cipher: string; sort: number }> = [];
  for (const cipher of defaultOrder.slice(pinnedCipherCount)) {
    keyedCiphers.push({ cipher, sort: Math.random() });
  }
  keyedCiphers.sort((left, right) => left.sort - right.sort);
  return pinnedCiphers.concat(keyedCiphers.map((entry) => entry.cipher));
}

View File

@@ -2,18 +2,18 @@ export function groupUrlsByDomain(urls: string[]): string[][] {
const domains = new Set<string>();
const urlsWithDomain = urls.map((url) => ({
url,
domain: extractDomain(url),
domain: getDomainFromUrl(url),
}));
for (const url of urlsWithDomain) {
domains.add(url.domain);
}
return Array.from(domains).map((domain) => {
return urlsWithDomain
.filter((url) => url.domain === domain)
.filter((url) => url.domain.toLowerCase() === domain.toLowerCase())
.map((url) => url.url);
});
}
function extractDomain(url: string): string {
return url.split('://')[1].split('/')[0].toLowerCase();
/**
 * Extracts the host part of an absolute URL using the WHATWG `URL` parser.
 *
 * @param url - Absolute URL string.
 * @returns The URL's `host` (hostname, plus the port when one is part of the parsed URL).
 * @throws TypeError when `url` cannot be parsed as an absolute URL.
 */
export function getDomainFromUrl(url: string): string {
  const { host } = new URL(url);
  return host;
}

View File

@@ -0,0 +1,19 @@
import { indentText } from '@tests/shared/Text';
/** Outcome of checking a single URL. */
export interface UrlStatus {
/** The URL that was checked. */
readonly url: string;
/** Description of the failure; absent when no error was recorded. */
readonly error?: string;
/** Response status code; absent when no response code was recorded. */
readonly code?: number;
}
/**
 * Renders a URL status as multi-line text: the URL first, then the response code when
 * one is present, then the indented error text when an error is present.
 *
 * @param status - Status to render.
 * @returns Newline-separated description of the status.
 */
export function formatUrlStatus(status: UrlStatus): string {
  const lines = [`URL: ${status.url}`];
  if (status.code !== undefined) {
    lines.push(`Response code: ${status.code}`);
  }
  if (status.error) {
    lines.push(`Error:\n${indentText(status.error)}`);
  }
  return lines.join('\n');
}

View File

@@ -3,73 +3,28 @@ export function getRandomUserAgent(): string {
}
/**
 * Pool of browser `User-Agent` header values.
 * NOTE(review): presumably sampled by `getRandomUserAgent`; its body is not visible here.
 *
 * Each entry must be a bare User-Agent value: stray punctuation inside the string
 * literal (such as a trailing comma) would be sent to servers verbatim.
 */
const UserAgents = [
  // Chrome
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537',
  // Firefox
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.2 Safari/605.1.15',
  // Safari
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Safari/604.1',
  // Internet Explorer
  'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
  // Edge
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3 Edge/15.0',
  // Opera
  'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
  // iOS Devices
  'Mozilla/5.0 (iPhone; CPU iPhone OS 12_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/18.2b11866 Mobile/16B91 Safari/605.1.15',
  'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1',
  // Android Devices
  'Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.3',
  // Other Devices/Browsers
  'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.3',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Safari/605.1.15',
  'Mozilla/5.0 (Windows Phone 10.0; Android 6.0.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Mobile Safari/537.3 Edge/15.0',
  'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
  'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
  'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0',
  'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.3',
  'Mozilla/5.0 (Linux; Android 7.0; SM-G930F Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.83 Mobile Safari/537.3',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.3',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
  'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
  'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.3 OPR/53.0.2907.99',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2)',
  'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20120121 Firefox/46.0',
  'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; Tablet PC 2.0)',
  'Mozilla/5.0 (Windows NT 5.1; rv:36.0) Gecko/20100101 Firefox/36.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0',
  'Mozilla/5.0 (X11; Linux i686; rv:30.0) Gecko/20100101 Firefox/30.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:28.0) Gecko/20100101 Firefox/28.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.3',
  'Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/537.3',
  'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.3',
  'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/20161202 Firefox/21.0.1',
  'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0',
  'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0',
  'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0',
  'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0',
  'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.3',
  'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.3',
  'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.3',
  'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.3',
  'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.3',
  'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.3',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.3',
  'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.3',
  'Mozilla/5.0 (X11; CrOS x86_64 4319.74.0) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.3',
  'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.3',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.3',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.3',
  'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.3',
  // Safari 17.1 - macOS and iPad
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
  // Safari - iOS 17 - iPhone
  'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',
  // Safari - iOS 17 - iPad mini
  'Mozilla/5.0 (iPad; CPU OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',
  // Edge - macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51',
  // Edge - Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58',
  // Edge - Android
  'Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.43 Mobile Safari/537.36 EdgA/119.0.2151.92',
  // Chrome - macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
  // Chrome - Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
  // Chrome - Android (Phone)
  'Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36',
  // Firefox - macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/114.0',
  // Firefox - Windows (the stray comma that used to sit inside this literal is removed)
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0',
  // Firefox - Android (Phone)
  'Mozilla/5.0 (Android 14; Mobile; rv:109.0) Gecko/120.0 Firefox/120.0',
];

View File

@@ -1,50 +1,82 @@
import { test, expect } from 'vitest';
import { parseApplication } from '@/application/Parser/ApplicationParser';
import type { IApplication } from '@/domain/IApplication';
import { getUrlStatusesInParallel, type IBatchRequestOptions } from './StatusChecker/BatchStatusChecker';
import type { IUrlStatus } from './StatusChecker/IUrlStatus';
import { indentText } from '@tests/shared/Text';
import { formatAssertionMessage } from '@tests/shared/FormatAssertionMessage';
import { type UrlStatus, formatUrlStatus } from './StatusChecker/UrlStatus';
import { getUrlStatusesInParallel, type BatchRequestOptions } from './StatusChecker/BatchStatusChecker';
// arrange
const app = parseApplication();
const urls = collectUniqueUrls(app);
const requestOptions: IBatchRequestOptions = {
const urls = collectUniqueUrls({
application: app,
excludePatterns: [
/^https:\/\/archive\.ph/, // Drops HEAD/GET requests via fetch/curl, responding to Postman/Chromium.
],
});
const requestOptions: BatchRequestOptions = {
domainOptions: {
sameDomainParallelize: false, // be nice to our external servers
sameDomainParallelize: false, // be nice to our third-party servers
sameDomainDelayInMs: 5 /* sec */ * 1000,
},
requestOptions: {
retryExponentialBaseInMs: 3 /* sec */ * 1000,
requestTimeoutInMs: 60 /* sec */ * 1000,
additionalHeaders: { referer: app.projectDetails.homepage },
randomizeTlsFingerprint: true,
},
followOptions: {
followRedirects: true,
enableCookies: true,
},
};
const testTimeoutInMs = urls.length * 60 /* seconds */ * 1000;
test(`all URLs (${urls.length}) should be alive`, async () => {
// act
const results = await getUrlStatusesInParallel(urls, requestOptions);
const deadUrls = results.filter((r) => r.code !== 200);
expect(deadUrls).to.have.lengthOf(0, printUrls(deadUrls));
// assert
const deadUrls = results.filter((r) => r.code === undefined || !isOkStatusCode(r.code));
expect(deadUrls).to.have.lengthOf(0, formatAssertionMessage([formatUrlStatusReport(deadUrls)]));
}, testTimeoutInMs);
function collectUniqueUrls(application: IApplication): string[] {
/**
 * Tells whether an HTTP status code is a success code, i.e. at least 200 and below 300.
 *
 * @param statusCode - HTTP response status code.
 * @returns `true` for any 2XX code, otherwise `false`.
 */
function isOkStatusCode(statusCode: number): boolean {
  const firstSuccessCode = 200;
  const firstNonSuccessCode = 300;
  if (statusCode < firstNonSuccessCode) {
    return statusCode >= firstSuccessCode;
  }
  return false;
}
function collectUniqueUrls(
options: {
readonly application: IApplication,
readonly excludePatterns?: readonly RegExp[],
},
): string[] {
return [ // Get all nodes
...application.collections.flatMap((c) => c.getAllCategories()),
...application.collections.flatMap((c) => c.getAllScripts()),
...options.application.collections.flatMap((c) => c.getAllCategories()),
...options.application.collections.flatMap((c) => c.getAllScripts()),
]
// Get all docs
.flatMap((documentable) => documentable.docs)
// Parse all URLs
.flatMap((docString) => docString.match(/(https?:\/\/[^\s`"<>()]+)/g) || [])
.flatMap((docString) => extractUrls(docString))
// Remove duplicates
.filter((url, index, array) => array.indexOf(url) === index);
.filter((url, index, array) => array.indexOf(url) === index)
// Exclude certain URLs based on patterns
.filter((url) => !shouldExcludeUrl(url, options.excludePatterns ?? []));
}
function printUrls(statuses: IUrlStatus[]): string {
/* eslint-disable prefer-template */
return '\n'
+ statuses.map((status) => `- ${status.url}\n`
+ (status.code ? `\tResponse code: ${status.code}` : '')
+ (status.error ? `\tError: ${status.error}` : ''))
.join('\n')
+ '\n';
/* eslint-enable prefer-template */
/**
 * Tells whether a URL matches at least one of the exclusion patterns.
 *
 * `String.prototype.search` is used instead of `RegExp.prototype.test` because `test`
 * advances `lastIndex` on patterns carrying the `g` or `y` flag; reusing such a pattern
 * across many URLs would then give results that depend on the previous call.
 *
 * @param url - URL to check.
 * @param patterns - Exclusion patterns; an empty list excludes nothing.
 * @returns `true` when any pattern matches the URL.
 */
function shouldExcludeUrl(url: string, patterns: readonly RegExp[]): boolean {
  return patterns.some((pattern) => url.search(pattern) !== -1);
}
/**
 * Builds the failure report shown in the assertion message: every status is formatted
 * and indented, entries are separated by a `---` line, and the whole report is wrapped
 * in a leading and a trailing newline.
 *
 * @param deadUrlStatuses - Statuses to include in the report.
 * @returns The report text.
 */
function formatUrlStatusReport(deadUrlStatuses: readonly UrlStatus[]): string {
  const entrySeparator = '\n---\n';
  const entries = deadUrlStatuses.map((status) => indentText(formatUrlStatus(status)));
  return ['\n', entries.join(entrySeparator), '\n'].join('');
}
/**
 * Extracts the HTTP(S) URLs from documentation text, skipping anything written inside
 * Markdown inline code.
 *
 * Inline code contains URLs that are not intended for user interaction and are not
 * guaranteed to support the expected HTTP methods, which leads to false negatives.
 *
 * @param textWithInlineCode - Documentation text that may contain inline code spans.
 * @returns All URLs found outside inline code, in order of appearance; empty when none.
 */
function extractUrls(textWithInlineCode: string): string[] {
  /*
    Remove complete single-line inline code spans first. A lookbehind alone only rejects
    a URL placed directly after the opening backtick and would still pick up URLs that
    appear later inside the span (e.g. `curl https://…`).
  */
  const inlineCodeSpanRegex = /`[^`\n]+`/g;
  const textWithoutInlineCode = textWithInlineCode.replace(inlineCodeSpanRegex, ' ');
  // The lookbehind is kept so a URL right after an unmatched backtick stays excluded.
  const nonCodeBlockUrlRegex = /(?<!`)(https?:\/\/[^\s`"<>()]+)/g;
  return textWithoutInlineCode.match(nonCodeBlockUrlRegex) || [];
}