Refactor and improve external URL checks

- Move external URL checks to its own module under `tests/`. This
  separates them from integration test, addressing long runs and
  frequent failures that led to ignoring test results.
- Move `check-desktop-runtime-errors` to `tests/checks` to keep all
  test-related checks into one directory.
- Replace `ts-node` with `vite` for running
  `check-desktop-runtime-errors` to maintain a consistent execution
  environment across checks.
- Implement a timeout for each fetch call.
- Be nice to external sources, wait 5 seconds before sending another
  request to an URL under same domain. This solves rate-limiting issues.
- Instead of running test on every push/pull request, run them only
  weekly.
- Do not run tests on each commit/PR but only scheduled (weekly) to
  minimize noise.
- Fix URLs are not captured correctly inside backticks or parenthesis.
This commit is contained in:
undergroundwires
2023-09-01 00:18:47 +02:00
parent f4d86fccfd
commit 19e42c9c52
43 changed files with 400 additions and 449 deletions

View File

@@ -0,0 +1,75 @@
import { sleep } from '@/infrastructure/Threading/AsyncSleep';
import { IUrlStatus } from './IUrlStatus';
import { getUrlStatus, IRequestOptions } from './Requestor';
import { groupUrlsByDomain } from './UrlPerDomainGrouper';
export async function getUrlStatusesInParallel(
urls: string[],
options?: IBatchRequestOptions,
): Promise<IUrlStatus[]> {
// urls = [ 'https://privacy.sexy' ]; // Here to comment out when testing
const uniqueUrls = Array.from(new Set(urls));
options = { ...DefaultOptions, ...options };
console.log('Options: ', options);
const results = await request(uniqueUrls, options);
return results;
}
export interface IBatchRequestOptions {
domainOptions?: IDomainOptions;
requestOptions?: IRequestOptions;
}
interface IDomainOptions {
sameDomainParallelize?: boolean;
sameDomainDelayInMs?: number;
}
const DefaultOptions: IBatchRequestOptions = {
domainOptions: {
sameDomainParallelize: false,
sameDomainDelayInMs: 3 /* sec */ * 1000,
},
requestOptions: {
retryExponentialBaseInMs: 5 /* sec */ * 1000,
requestTimeoutInMs: 60 /* sec */ * 1000,
additionalHeaders: {},
},
};
function request(
urls: string[],
options: IBatchRequestOptions,
): Promise<IUrlStatus[]> {
if (!options.domainOptions.sameDomainParallelize) {
return runOnEachDomainWithDelay(
urls,
(url) => getUrlStatus(url, options.requestOptions),
options.domainOptions.sameDomainDelayInMs,
);
}
return Promise.all(urls.map((url) => getUrlStatus(url, options.requestOptions)));
}
async function runOnEachDomainWithDelay(
urls: string[],
action: (url: string) => Promise<IUrlStatus>,
delayInMs: number,
): Promise<IUrlStatus[]> {
const grouped = groupUrlsByDomain(urls);
const tasks = grouped.map(async (group) => {
const results = new Array<IUrlStatus>();
/* eslint-disable no-await-in-loop */
for (const url of group) {
const status = await action(url);
results.push(status);
if (results.length !== group.length) {
await sleep(delayInMs);
}
}
/* eslint-enable no-await-in-loop */
return results;
});
const r = await Promise.all(tasks);
return r.flat();
}

View File

@@ -0,0 +1,48 @@
import { sleep } from '@/infrastructure/Threading/AsyncSleep';
import { IUrlStatus } from './IUrlStatus';
const DefaultBaseRetryIntervalInMs = 5 /* sec */ * 1000;
export async function retryWithExponentialBackOff(
action: () => Promise<IUrlStatus>,
baseRetryIntervalInMs: number = DefaultBaseRetryIntervalInMs,
currentRetry = 1,
): Promise<IUrlStatus> {
const maxTries = 3;
const status = await action();
if (shouldRetry(status)) {
if (currentRetry <= maxTries) {
const exponentialBackOffInMs = getRetryTimeoutInMs(currentRetry, baseRetryIntervalInMs);
// tslint:disable-next-line: no-console
console.log(`Retrying (${currentRetry}) in ${exponentialBackOffInMs / 1000} seconds`, status);
await sleep(exponentialBackOffInMs);
return retryWithExponentialBackOff(action, baseRetryIntervalInMs, currentRetry + 1);
}
}
return status;
}
function shouldRetry(status: IUrlStatus) {
if (status.error) {
return true;
}
return isTransientError(status.code)
|| status.code === 429; // Too Many Requests
}
function isTransientError(statusCode: number) {
return statusCode >= 500 && statusCode <= 599;
}
function getRetryTimeoutInMs(
currentRetry: number,
baseRetryIntervalInMs: number = DefaultBaseRetryIntervalInMs,
) {
const retryRandomFactor = 0.5; // Retry intervals are between 50% and 150%
// of the exponentially increasing base amount
const minRandom = 1 - retryRandomFactor;
const maxRandom = 1 + retryRandomFactor;
const randomization = (Math.random() * (maxRandom - minRandom)) + maxRandom;
const exponential = 2 ** (currentRetry - 1);
return Math.ceil(exponential * baseRetryIntervalInMs * randomization);
}

View File

@@ -0,0 +1,107 @@
import { fetchWithTimeout } from './FetchWithTimeout';
export function fetchFollow(
url: string,
timeoutInMs: number,
fetchOptions: RequestInit,
followOptions: IFollowOptions,
): Promise<Response> {
followOptions = { ...DefaultOptions, ...followOptions };
if (followRedirects(followOptions)) {
return fetchWithTimeout(url, timeoutInMs, fetchOptions);
}
fetchOptions = { ...fetchOptions, redirect: 'manual' /* handled manually */ };
const cookies = new CookieStorage(followOptions.enableCookies);
return followRecursivelyWithCookies(
url,
timeoutInMs,
fetchOptions,
followOptions.maximumRedirectFollowDepth,
cookies,
);
}
export interface IFollowOptions {
followRedirects?: boolean;
maximumRedirectFollowDepth?: number;
enableCookies?: boolean;
}
const DefaultOptions: IFollowOptions = {
followRedirects: true,
maximumRedirectFollowDepth: 20,
enableCookies: true,
};
async function followRecursivelyWithCookies(
url: string,
timeoutInMs: number,
options: RequestInit,
followDepth: number,
cookies: CookieStorage,
): Promise<Response> {
options = updateCookieHeader(cookies, options);
const response = await fetchWithTimeout(
url,
timeoutInMs,
options,
);
if (!isRedirect(response.status)) {
return response;
}
const newFollowDepth = followDepth - 1;
if (newFollowDepth < 0) {
throw new Error(`[max-redirect] maximum redirect reached at: ${url}`);
}
const cookieHeader = response.headers.get('set-cookie');
cookies.addHeader(cookieHeader);
const nextUrl = response.headers.get('location');
return followRecursivelyWithCookies(nextUrl, timeoutInMs, options, newFollowDepth, cookies);
}
function isRedirect(code: number): boolean {
return code === 301 || code === 302 || code === 303 || code === 307 || code === 308;
}
class CookieStorage {
public cookies = new Array<string>();
constructor(private readonly enabled: boolean) {
}
public hasAny() {
return this.enabled && this.cookies.length > 0;
}
public addHeader(header: string) {
if (!this.enabled || !header) {
return;
}
this.cookies.push(header);
}
public getHeader() {
return this.cookies.join(' ; ');
}
}
function followRedirects(options: IFollowOptions) {
if (!options.followRedirects) {
return false;
}
if (options.maximumRedirectFollowDepth === 0) {
return false;
}
return true;
}
function updateCookieHeader(
cookies: CookieStorage,
options: RequestInit,
): RequestInit {
if (!cookies.hasAny()) {
return options;
}
const newOptions = { ...options, headers: { ...options.headers, cookie: cookies.getHeader() } };
return newOptions;
}

View File

@@ -0,0 +1,16 @@
import fetch from 'cross-fetch';
export async function fetchWithTimeout(
url: string,
timeoutInMs: number,
init?: RequestInit,
): Promise<Response> {
const controller = new AbortController();
const options: RequestInit = {
...(init ?? {}),
signal: controller.signal,
};
const promise = fetch(url, options);
const timeout = setTimeout(() => controller.abort(), timeoutInMs);
return promise.finally(() => clearTimeout(timeout));
}

View File

@@ -0,0 +1,5 @@
export interface IUrlStatus {
url: string;
error?: string;
code?: number;
}

View File

@@ -0,0 +1,111 @@
# status-checker
A CLI and SDK for checking the availability of external URLs.
🧐 Why?
- 🏃 **Fast**: Batch checks the statuses of URLs in parallel.
- 🤖 **Easy-to-Use**: Zero-touch startup with pre-configured settings for reliable results, yet customizable.
- 🤞 **Reliable**: Mimics real web browser behavior by following redirects and maintaining cookie storage.
🍭 Additional features
- 😇 **Rate Limiting**: Queues requests by domain to be polite.
- 🔁 **Retries**: Implements retry pattern with exponential back-off.
-**Timeouts**: Configurable timeout for each request.
- 🎭️ **User-Agent Rotation**: Change user agents for each request.
## CLI
Coming soon 🚧
## Programmatic usage
The SDK supports both Node.js and browser environments.
### `getUrlStatusesInParallel`
```js
// Simple example
const statuses = await getUrlStatusesInParallel([ 'https://privacy.sexy', /* ... */ ]);
if(statuses.all((r) => r.code === 200)) {
console.log('All URLs are alive!');
} else {
console.log('Dead URLs:', statuses.filter((r) => r.code !== 200).map((r) => r.url));
}
// Fastest configuration
const statuses = await getUrlStatusesInParallel([ 'https://privacy.sexy', /* ... */ ], {
domainOptions: {
sameDomainParallelize: false,
}
});
```
#### Batch request options
- `domainOptions`:
- **`sameDomainParallelize`**, (*boolean*), default: `false`
- Determines if requests to the same domain will be parallelized.
- Setting to `false` makes all requests parallel.
- Setting to `true` queues requests for each unique domain while parallelizing across different domains.
- Requests to different domains are always parallelized regardless of this option.
- 💡 This helps to avoid `429 Too Many Requests` and be nice to websites
- **`sameDomainDelayInMs`** (*number*), default: `3000` (3 seconds)
- Sets the delay between requests to the same domain.
- `requestOptions` (*object*): See [request options](#request-options).
### `getUrlStatus`
Check the availability of a single URL.
```js
// Simple example
const status = await getUrlStatus('https://privacy.sexy');
console.log(`Status code: ${status.code}`);
```
#### Request options
- **`retryExponentialBaseInMs`** (*number*), default: `5000` (5 seconds)
- Base time for the exponential back-off calculation for retries.
- The longer the base time, the greater the intervals between retries.
- **`additionalHeaders`** (*object*), default: `false`
- Additional HTTP headers to send along with the default headers. Overrides default headers if specified.
- **`followOptions`** (*object*): See [follow options](#follow-options).
- **`requestTimeoutInMs`** (*number*), default: `60000` (60 seconds)
- Time limit to abort the request if no response is received within the specified time frame.
### `fetchFollow`
Follows `3XX` redirects while preserving cookies.
Same fetch API except third parameter that specifies [follow options](#follow-options), `redirect: 'follow' | 'manual' | 'error'` is discarded in favor of the third parameter.
```js
const status = await fetchFollow('https://privacy.sexy', {
// First argument is same options as fetch API, except `redirect` options
// that's discarded in favor of next argument follow options
headers: {
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'
},
}, {
// Second argument sets the redirect behavior
followRedirects: true,
maximumRedirectFollowDepth: 20,
enableCookies: true,
}
);
console.log(`Status code: ${status.code}`);
```
#### Follow options
- **`followRedirects`** (*boolean*), default: `true`
- Determines whether or not to follow redirects with `3XX` response codes.
- **`maximumRedirectFollowDepth`** (*boolean*), default: `20`
- Specifies the maximum number of sequential redirects that the function will follow.
- 💡 Helps to solve maximum redirect reached errors.
- **`enableCookies`** (*boolean*), default: `true`
- Enables cookie storage to facilitate seamless navigation through login or other authentication challenges.
- 💡 Helps to over-come sign-in challenges with callbacks.

View File

@@ -0,0 +1,69 @@
import { retryWithExponentialBackOff } from './ExponentialBackOffRetryHandler';
import { IUrlStatus } from './IUrlStatus';
import { fetchFollow, IFollowOptions } from './FetchFollow';
import { getRandomUserAgent } from './UserAgents';
export function getUrlStatus(
url: string,
options: IRequestOptions = DefaultOptions,
): Promise<IUrlStatus> {
options = { ...DefaultOptions, ...options };
const fetchOptions = getFetchOptions(url, options);
return retryWithExponentialBackOff(async () => {
console.log('Requesting', url);
let result: IUrlStatus;
try {
const response = await fetchFollow(
url,
options.requestTimeoutInMs,
fetchOptions,
options.followOptions,
);
result = { url, code: response.status };
} catch (err) {
result = { url, error: JSON.stringify(err, null, '\t') };
}
return result;
}, options.retryExponentialBaseInMs);
}
export interface IRequestOptions {
retryExponentialBaseInMs?: number;
additionalHeaders?: Record<string, string>;
additionalHeadersUrlIgnore?: string[];
followOptions?: IFollowOptions;
requestTimeoutInMs: number;
}
const DefaultOptions: IRequestOptions = {
retryExponentialBaseInMs: 5000,
additionalHeaders: {},
additionalHeadersUrlIgnore: [],
requestTimeoutInMs: 60 /* seconds */ * 1000,
};
function getFetchOptions(url: string, options: IRequestOptions): RequestInit {
const additionalHeaders = options.additionalHeadersUrlIgnore
.some((ignorePattern) => url.startsWith(ignorePattern))
? {}
: options.additionalHeaders;
return {
method: 'HEAD',
headers: {
...getDefaultHeaders(),
...additionalHeaders,
},
};
}
function getDefaultHeaders(): Record<string, string> {
return {
'user-agent': getRandomUserAgent(),
'upgrade-insecure-requests': '1',
connection: 'keep-alive',
accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, br',
'cache-control': 'max-age=0',
'accept-language': 'en-US,en;q=0.9',
};
}

View File

@@ -0,0 +1,19 @@
export function groupUrlsByDomain(urls: string[]): string[][] {
const domains = new Set<string>();
const urlsWithDomain = urls.map((url) => ({
url,
domain: extractDomain(url),
}));
for (const url of urlsWithDomain) {
domains.add(url.domain);
}
return Array.from(domains).map((domain) => {
return urlsWithDomain
.filter((url) => url.domain === domain)
.map((url) => url.url);
});
}
function extractDomain(url: string): string {
return url.split('://')[1].split('/')[0].toLowerCase();
}

View File

@@ -0,0 +1,75 @@
export function getRandomUserAgent(): string {
return UserAgents[Math.floor(Math.random() * UserAgents.length)];
}
const UserAgents = [
// Chrome
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537',
// Firefox
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.2 Safari/605.1.15',
// Safari
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Safari/604.1',
// Internet Explorer
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
// Edge
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3 Edge/15.0',
// Opera
'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
// iOS Devices
'Mozilla/5.0 (iPhone; CPU iPhone OS 12_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/18.2b11866 Mobile/16B91 Safari/605.1.15',
'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1',
// Android Devices
'Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.3',
// Other Devices/Browsers
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.3',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Safari/605.1.15',
'Mozilla/5.0 (Windows Phone 10.0; Android 6.0.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Mobile Safari/537.3 Edge/15.0',
'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0',
'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.3',
'Mozilla/5.0 (Linux; Android 7.0; SM-G930F Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.83 Mobile Safari/537.3',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.3',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.3 OPR/53.0.2907.99',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20120121 Firefox/46.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; Tablet PC 2.0)',
'Mozilla/5.0 (Windows NT 5.1; rv:36.0) Gecko/20100101 Firefox/36.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0',
'Mozilla/5.0 (X11; Linux i686; rv:30.0) Gecko/20100101 Firefox/30.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:28.0) Gecko/20100101 Firefox/28.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.3',
'Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/537.3',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.3',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/20161202 Firefox/21.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0',
'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0',
'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.3',
'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.3',
'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.3',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.3',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.3',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.3',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.3',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.3',
'Mozilla/5.0 (X11; CrOS x86_64 4319.74.0) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.3',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.3',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.3',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.3',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.3 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.3',
];