Refactor and improve external URL checks
- Move external URL checks to their own module under `tests/`. This separates them from integration tests, addressing long runs and frequent failures that led to ignoring test results. - Move `check-desktop-runtime-errors` to `tests/checks` to keep all test-related checks in one directory. - Replace `ts-node` with `vite` for running `check-desktop-runtime-errors` to maintain a consistent execution environment across checks. - Implement a timeout for each fetch call. - Be nice to external sources: wait 5 seconds before sending another request to a URL under the same domain. This solves rate-limiting issues. - Instead of running tests on every push/pull request, run them only weekly. - Do not run tests on each commit/PR but only scheduled (weekly) to minimize noise. - Fix URLs not being captured correctly inside backticks or parentheses.
This commit is contained in:
@@ -1,56 +0,0 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { parseApplication } from '@/application/Parser/ApplicationParser';
|
||||
import { IApplication } from '@/domain/IApplication';
|
||||
import { IUrlStatus } from './StatusChecker/IUrlStatus';
|
||||
import { getUrlStatusesInParallel, IBatchRequestOptions } from './StatusChecker/BatchStatusChecker';
|
||||
|
||||
describe('collections', () => {
|
||||
// arrange
|
||||
const app = parseApplication();
|
||||
const urls = collectUniqueUrls(app);
|
||||
const options: IBatchRequestOptions = {
|
||||
domainOptions: {
|
||||
sameDomainParallelize: true, // no need to be so nice until sources start failing
|
||||
// sameDomainDelayInMs: 2 /* sec */ * 1000,
|
||||
},
|
||||
requestOptions: {
|
||||
retryExponentialBaseInMs: 3 /* sec */ * 1000,
|
||||
additionalHeaders: { referer: app.info.homepage },
|
||||
additionalHeadersUrlIgnore: [
|
||||
'http://batcmd.com/', // Otherwise it responds with 403
|
||||
],
|
||||
},
|
||||
};
|
||||
const testTimeoutInMs = urls.length * 60 /* minutes */ * 1000;
|
||||
it('have no dead urls', async () => {
|
||||
// act
|
||||
const results = await getUrlStatusesInParallel(urls, options);
|
||||
// assert
|
||||
const deadUrls = results.filter((r) => r.code !== 200);
|
||||
expect(deadUrls).to.have.lengthOf(0, printUrls(deadUrls));
|
||||
}, testTimeoutInMs);
|
||||
});
|
||||
|
||||
function collectUniqueUrls(app: IApplication): string[] {
|
||||
return [ // Get all nodes
|
||||
...app.collections.flatMap((c) => c.getAllCategories()),
|
||||
...app.collections.flatMap((c) => c.getAllScripts()),
|
||||
]
|
||||
// Get all docs
|
||||
.flatMap((documentable) => documentable.docs)
|
||||
// Parse all URLs
|
||||
.flatMap((docString) => docString.match(/(https?:\/\/[^\s]+)/g) || [])
|
||||
// Remove duplicates
|
||||
.filter((url, index, array) => array.indexOf(url) === index);
|
||||
}
|
||||
|
||||
function printUrls(statuses: IUrlStatus[]): string {
|
||||
/* eslint-disable prefer-template */
|
||||
return '\n'
|
||||
+ statuses.map((status) => `- ${status.url}\n`
|
||||
+ (status.code ? `\tResponse code: ${status.code}` : '')
|
||||
+ (status.error ? `\tError: ${status.error}` : ''))
|
||||
.join('\n')
|
||||
+ '\n';
|
||||
/* eslint-enable prefer-template */
|
||||
}
|
||||
@@ -1,74 +0,0 @@
|
||||
import { sleep } from '@/infrastructure/Threading/AsyncSleep';
|
||||
import { IUrlStatus } from './IUrlStatus';
|
||||
import { getUrlStatus, IRequestOptions } from './Requestor';
|
||||
import { groupUrlsByDomain } from './UrlPerDomainGrouper';
|
||||
|
||||
export async function getUrlStatusesInParallel(
|
||||
urls: string[],
|
||||
options?: IBatchRequestOptions,
|
||||
): Promise<IUrlStatus[]> {
|
||||
// urls = [ 'https://privacy.sexy' ]; // Here to comment out when testing
|
||||
const uniqueUrls = Array.from(new Set(urls));
|
||||
options = { ...DefaultOptions, ...options };
|
||||
console.log('Options: ', options);
|
||||
const results = await request(uniqueUrls, options);
|
||||
return results;
|
||||
}
|
||||
|
||||
export interface IBatchRequestOptions {
|
||||
domainOptions?: IDomainOptions;
|
||||
requestOptions?: IRequestOptions;
|
||||
}
|
||||
|
||||
interface IDomainOptions {
|
||||
sameDomainParallelize?: boolean;
|
||||
sameDomainDelayInMs?: number;
|
||||
}
|
||||
|
||||
const DefaultOptions: IBatchRequestOptions = {
|
||||
domainOptions: {
|
||||
sameDomainParallelize: false,
|
||||
sameDomainDelayInMs: 3 /* sec */ * 1000,
|
||||
},
|
||||
requestOptions: {
|
||||
retryExponentialBaseInMs: 5 /* sec */ * 1000,
|
||||
additionalHeaders: {},
|
||||
},
|
||||
};
|
||||
|
||||
function request(
|
||||
urls: string[],
|
||||
options: IBatchRequestOptions,
|
||||
): Promise<IUrlStatus[]> {
|
||||
if (!options.domainOptions.sameDomainParallelize) {
|
||||
return runOnEachDomainWithDelay(
|
||||
urls,
|
||||
(url) => getUrlStatus(url, options.requestOptions),
|
||||
options.domainOptions.sameDomainDelayInMs,
|
||||
);
|
||||
}
|
||||
return Promise.all(urls.map((url) => getUrlStatus(url, options.requestOptions)));
|
||||
}
|
||||
|
||||
async function runOnEachDomainWithDelay(
|
||||
urls: string[],
|
||||
action: (url: string) => Promise<IUrlStatus>,
|
||||
delayInMs: number,
|
||||
): Promise<IUrlStatus[]> {
|
||||
const grouped = groupUrlsByDomain(urls);
|
||||
const tasks = grouped.map(async (group) => {
|
||||
const results = new Array<IUrlStatus>();
|
||||
/* eslint-disable no-await-in-loop */
|
||||
for (const url of group) {
|
||||
const status = await action(url);
|
||||
results.push(status);
|
||||
if (results.length !== group.length) {
|
||||
await sleep(delayInMs);
|
||||
}
|
||||
}
|
||||
/* eslint-enable no-await-in-loop */
|
||||
return results;
|
||||
});
|
||||
const r = await Promise.all(tasks);
|
||||
return r.flat();
|
||||
}
|
||||
@@ -1,48 +0,0 @@
|
||||
import { sleep } from '@/infrastructure/Threading/AsyncSleep';
|
||||
import { IUrlStatus } from './IUrlStatus';
|
||||
|
||||
const DefaultBaseRetryIntervalInMs = 5 /* sec */ * 1000;
|
||||
|
||||
export async function retryWithExponentialBackOff(
|
||||
action: () => Promise<IUrlStatus>,
|
||||
baseRetryIntervalInMs: number = DefaultBaseRetryIntervalInMs,
|
||||
currentRetry = 1,
|
||||
): Promise<IUrlStatus> {
|
||||
const maxTries = 3;
|
||||
const status = await action();
|
||||
if (shouldRetry(status)) {
|
||||
if (currentRetry <= maxTries) {
|
||||
const exponentialBackOffInMs = getRetryTimeoutInMs(currentRetry, baseRetryIntervalInMs);
|
||||
// tslint:disable-next-line: no-console
|
||||
console.log(`Retrying (${currentRetry}) in ${exponentialBackOffInMs / 1000} seconds`, status);
|
||||
await sleep(exponentialBackOffInMs);
|
||||
return retryWithExponentialBackOff(action, baseRetryIntervalInMs, currentRetry + 1);
|
||||
}
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
function shouldRetry(status: IUrlStatus) {
|
||||
if (status.error) {
|
||||
return true;
|
||||
}
|
||||
return isTransientError(status.code)
|
||||
|| status.code === 429; // Too Many Requests
|
||||
}
|
||||
|
||||
function isTransientError(statusCode: number) {
|
||||
return statusCode >= 500 && statusCode <= 599;
|
||||
}
|
||||
|
||||
function getRetryTimeoutInMs(
|
||||
currentRetry: number,
|
||||
baseRetryIntervalInMs: number = DefaultBaseRetryIntervalInMs,
|
||||
) {
|
||||
const retryRandomFactor = 0.5; // Retry intervals are between 50% and 150%
|
||||
// of the exponentially increasing base amount
|
||||
const minRandom = 1 - retryRandomFactor;
|
||||
const maxRandom = 1 + retryRandomFactor;
|
||||
const randomization = (Math.random() * (maxRandom - minRandom)) + maxRandom;
|
||||
const exponential = 2 ** (currentRetry - 1);
|
||||
return Math.ceil(exponential * baseRetryIntervalInMs * randomization);
|
||||
}
|
||||
@@ -1,100 +0,0 @@
|
||||
import fetch from 'cross-fetch';
|
||||
|
||||
export function fetchFollow(
|
||||
url: string,
|
||||
fetchOptions: RequestInit,
|
||||
followOptions: IFollowOptions,
|
||||
): Promise<Response> {
|
||||
followOptions = { ...DefaultOptions, ...followOptions };
|
||||
if (followRedirects(followOptions)) {
|
||||
return fetch(url, fetchOptions);
|
||||
}
|
||||
fetchOptions = { ...fetchOptions, redirect: 'manual' /* handled manually */ };
|
||||
const cookies = new CookieStorage(followOptions.enableCookies);
|
||||
return followRecursivelyWithCookies(
|
||||
url,
|
||||
fetchOptions,
|
||||
followOptions.maximumRedirectFollowDepth,
|
||||
cookies,
|
||||
);
|
||||
}
|
||||
|
||||
export interface IFollowOptions {
|
||||
followRedirects?: boolean;
|
||||
maximumRedirectFollowDepth?: number;
|
||||
enableCookies?: boolean;
|
||||
}
|
||||
|
||||
const DefaultOptions: IFollowOptions = {
|
||||
followRedirects: true,
|
||||
maximumRedirectFollowDepth: 20,
|
||||
enableCookies: true,
|
||||
};
|
||||
|
||||
async function followRecursivelyWithCookies(
|
||||
url: string,
|
||||
options: RequestInit,
|
||||
followDepth: number,
|
||||
cookies: CookieStorage,
|
||||
): Promise<Response> {
|
||||
options = updateCookieHeader(cookies, options);
|
||||
const response = await fetch(url, options);
|
||||
if (!isRedirect(response.status)) {
|
||||
return response;
|
||||
}
|
||||
const newFollowDepth = followDepth - 1;
|
||||
if (newFollowDepth < 0) {
|
||||
throw new Error(`[max-redirect] maximum redirect reached at: ${url}`);
|
||||
}
|
||||
const cookieHeader = response.headers.get('set-cookie');
|
||||
cookies.addHeader(cookieHeader);
|
||||
const nextUrl = response.headers.get('location');
|
||||
return followRecursivelyWithCookies(nextUrl, options, newFollowDepth, cookies);
|
||||
}
|
||||
|
||||
function isRedirect(code: number): boolean {
|
||||
return code === 301 || code === 302 || code === 303 || code === 307 || code === 308;
|
||||
}
|
||||
|
||||
class CookieStorage {
|
||||
public cookies = new Array<string>();
|
||||
|
||||
constructor(private readonly enabled: boolean) {
|
||||
}
|
||||
|
||||
public hasAny() {
|
||||
return this.enabled && this.cookies.length > 0;
|
||||
}
|
||||
|
||||
public addHeader(header: string) {
|
||||
if (!this.enabled || !header) {
|
||||
return;
|
||||
}
|
||||
this.cookies.push(header);
|
||||
}
|
||||
|
||||
public getHeader() {
|
||||
return this.cookies.join(' ; ');
|
||||
}
|
||||
}
|
||||
|
||||
function followRedirects(options: IFollowOptions) {
|
||||
if (!options.followRedirects) {
|
||||
return false;
|
||||
}
|
||||
if (options.maximumRedirectFollowDepth === 0) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
function updateCookieHeader(
|
||||
cookies: CookieStorage,
|
||||
options: RequestInit,
|
||||
): RequestInit {
|
||||
if (!cookies.hasAny()) {
|
||||
return options;
|
||||
}
|
||||
const newOptions = { ...options, headers: { ...options.headers, cookie: cookies.getHeader() } };
|
||||
return newOptions;
|
||||
}
|
||||
@@ -1,5 +0,0 @@
|
||||
export interface IUrlStatus {
|
||||
url: string;
|
||||
error?: string;
|
||||
code?: number;
|
||||
}
|
||||
@@ -1,108 +0,0 @@
|
||||
# status-checker
|
||||
|
||||
CLI and SDK to check whether an external URL is alive.
|
||||
|
||||
🧐 Why?
|
||||
|
||||
- 🏃🏻 Batch checking status of URLs in parallel.
|
||||
- 🤖 Zero-touch start, pre-configured for reliable results, still configurable.
|
||||
- 🤞 Reliable, mimics a real web browser by following redirect, and cookie storage.
|
||||
|
||||
🍭 Sweets such as
|
||||
|
||||
- 😇 Queueing requests by domain to be nice to them
|
||||
- 🔁 Retry pattern with exponential back-off
|
||||
|
||||
## CLI
|
||||
|
||||
Coming soon 🚧
|
||||
|
||||
## Programmatic usage
|
||||
|
||||
Programmatic usage is supported both on Node.js and browser.
|
||||
|
||||
### `getUrlStatusesInParallel`
|
||||
|
||||
```js
|
||||
// Simple example
|
||||
const statuses = await getUrlStatusesInParallel([ 'https://privacy.sexy', /* ... */ ]);
|
||||
if(statuses.every((r) => r.code === 200)) {
|
||||
console.log('All URLs are alive!');
|
||||
} else {
|
||||
console.log('Dead URLs:', statuses.filter((r) => r.code !== 200).map((r) => r.url));
|
||||
}
|
||||
|
||||
// Fastest configuration
|
||||
const statuses = await getUrlStatusesInParallel([ 'https://privacy.sexy', /* ... */ ], {
|
||||
domainOptions: {
|
||||
sameDomainParallelize: true,
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
#### Batch request options
|
||||
|
||||
- `domainOptions`:
|
||||
- **`sameDomainParallelize`**, (*boolean*), default: `false`
|
||||
- Determines whether requests to URLs under the same domain will be parallelized.
|
||||
- Setting `false` sends requests in a queue for each unique domain, still parallelizing across different domains.
|
||||
- Setting `true` parallelizes all requests, including those to the same domain.
|
||||
- Requests to different domains are always parallelized regardless of this option.
|
||||
- 💡 This helps to avoid `429 Too Many Requests` and be nice to websites
|
||||
- **`sameDomainDelayInMs`** (*number*), default: `3000` (3 seconds)
|
||||
- Sets delay between requests to same host (domain) if same domain parallelization is disabled.
|
||||
- `requestOptions` (*object*): See [request options](#request-options).
|
||||
|
||||
### `getUrlStatus`
|
||||
|
||||
Checks whether single URL is dead or alive.
|
||||
|
||||
```js
|
||||
// Simple example
|
||||
const status = await getUrlStatus('https://privacy.sexy');
|
||||
console.log(`Status code: ${status.code}`);
|
||||
```
|
||||
|
||||
#### Request options
|
||||
|
||||
- **`retryExponentialBaseInMs`** (*number*), default: `5000` (5 seconds)
|
||||
- The based time that's multiplied by exponential value for exponential backoff and retry calculations
|
||||
- The longer it is, the longer the delay between retries are.
|
||||
- **`additionalHeaders`** (*object*), default: `{}`
|
||||
- Additional headers that will be sent alongside default headers mimicking browser.
|
||||
- If default header are specified, additional headers override defaults.
|
||||
- **`followOptions`** (*object*): See [follow options](#follow-options).
|
||||
|
||||
### `fetchFollow`
|
||||
|
||||
Gets response from single URL by following `3XX` redirect targets by sending necessary cookies.
|
||||
|
||||
Same fetch API except third parameter that specifies [follow options](#follow-options), `redirect: 'follow' | 'manual' | 'error'` is discarded in favor of the third parameter.
|
||||
|
||||
```js
|
||||
const status = await fetchFollow('https://privacy.sexy', {
|
||||
// First argument is same options as fetch API, except `redirect` options
|
||||
// that's discarded in favor of next argument follow options
|
||||
headers: {
|
||||
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'
|
||||
},
|
||||
}, {
|
||||
// Second argument sets the redirect behavior
|
||||
followRedirects: true,
|
||||
maximumRedirectFollowDepth: 20,
|
||||
enableCookies: true,
|
||||
}
|
||||
);
|
||||
console.log(`Status code: ${status.code}`);
|
||||
```
|
||||
|
||||
#### Follow options
|
||||
|
||||
- **`followRedirects`** (*boolean*), default: `true`
|
||||
- Determines whether redirects with `3XX` response code will be followed.
|
||||
- **`maximumRedirectFollowDepth`** (*number*), default: `20`
|
||||
- Determines maximum consequent redirects that will be followed.
|
||||
- 💡 Helps to solve maximum redirect reached errors.
|
||||
- **`enableCookies`** (*boolean*), default: `true`
|
||||
- Saves cookies requested to store by webpages and sends them when redirected.
|
||||
- 💡 Helps to overcome sign-in challenges with callbacks.
|
||||
@@ -1,57 +0,0 @@
|
||||
import { retryWithExponentialBackOff } from './ExponentialBackOffRetryHandler';
|
||||
import { IUrlStatus } from './IUrlStatus';
|
||||
import { fetchFollow, IFollowOptions } from './FetchFollow';
|
||||
|
||||
export function getUrlStatus(
|
||||
url: string,
|
||||
options: IRequestOptions = DefaultOptions,
|
||||
): Promise<IUrlStatus> {
|
||||
options = { ...DefaultOptions, ...options };
|
||||
const fetchOptions = getFetchOptions(url, options);
|
||||
return retryWithExponentialBackOff(async () => {
|
||||
console.log('Requesting', url);
|
||||
let result: IUrlStatus;
|
||||
try {
|
||||
const response = await fetchFollow(url, fetchOptions, options.followOptions);
|
||||
result = { url, code: response.status };
|
||||
} catch (err) {
|
||||
result = { url, error: JSON.stringify(err, null, '\t') };
|
||||
}
|
||||
return result;
|
||||
}, options.retryExponentialBaseInMs);
|
||||
}
|
||||
|
||||
export interface IRequestOptions {
|
||||
retryExponentialBaseInMs?: number;
|
||||
additionalHeaders?: Record<string, string>;
|
||||
additionalHeadersUrlIgnore?: string[];
|
||||
followOptions?: IFollowOptions;
|
||||
}
|
||||
|
||||
const DefaultOptions: IRequestOptions = {
|
||||
retryExponentialBaseInMs: 5000,
|
||||
additionalHeaders: {},
|
||||
additionalHeadersUrlIgnore: [],
|
||||
};
|
||||
|
||||
function getFetchOptions(url: string, options: IRequestOptions): RequestInit {
|
||||
const additionalHeaders = options.additionalHeadersUrlIgnore
|
||||
.some((ignorePattern) => url.match(ignorePattern))
|
||||
? {}
|
||||
: options.additionalHeaders;
|
||||
return {
|
||||
method: 'GET',
|
||||
headers: { ...DefaultHeaders, ...additionalHeaders },
|
||||
};
|
||||
}
|
||||
|
||||
const DefaultHeaders: Record<string, string> = {
|
||||
/* Chrome on macOS */
|
||||
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
|
||||
'upgrade-insecure-requests': '1',
|
||||
connection: 'keep-alive',
|
||||
accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
|
||||
'accept-encoding': 'gzip, deflate, br',
|
||||
'cache-control': 'max-age=0',
|
||||
'accept-language': 'en-US,en;q=0.9',
|
||||
};
|
||||
@@ -1,19 +0,0 @@
|
||||
export function groupUrlsByDomain(urls: string[]): string[][] {
|
||||
const domains = new Set<string>();
|
||||
const urlsWithDomain = urls.map((url) => ({
|
||||
url,
|
||||
domain: extractDomain(url),
|
||||
}));
|
||||
for (const url of urlsWithDomain) {
|
||||
domains.add(url.domain);
|
||||
}
|
||||
return Array.from(domains).map((domain) => {
|
||||
return urlsWithDomain
|
||||
.filter((url) => url.domain === domain)
|
||||
.map((url) => url.url);
|
||||
});
|
||||
}
|
||||
|
||||
function extractDomain(url: string): string {
|
||||
return url.split('://')[1].split('/')[0].toLowerCase();
|
||||
}
|
||||
Reference in New Issue
Block a user