Fix failing URL status checking integration tests

Implement following redirects over `fetch` supporting cookies.
`node-fetch` does not support sending cookies during redirect. However,
this is needed to not end-up in a redirect loop for a sign-in callback.

Fix integration tests failing due to redirects and 403 errors:
  - Many redirects from `answers.microsoft.com` was throwing: throwing
    `FetchError: maximum redirect reached` error. It was caused by not
    having cookies when following redirects therefore having an infinite
    sign-in callback for the webpage.
  - Fixes integration tests failing due to additional referer header being
    sent by the application. It adds support for making exceptions to
    additional header sending through a list of regexes.

Add in-depth documentation for URL status checking.
This commit is contained in:
undergroundwires
2021-10-30 16:19:10 +01:00
parent 5ead1a087d
commit 799fb091b8
7 changed files with 202 additions and 17 deletions

View File

@@ -6,6 +6,7 @@ import { groupUrlsByDomain } from './UrlPerDomainGrouper';
export async function getUrlStatusesInParallelAsync(
urls: string[],
options?: IBatchRequestOptions): Promise<IUrlStatus[]> {
// urls = [ 'https://privacy.sexy' ]; // Here to comment out when testing
const uniqueUrls = Array.from(new Set(urls));
options = { ...DefaultOptions, ...options };
console.log('Options: ', options); // tslint:disable-line: no-console

View File

@@ -25,8 +25,8 @@ function shouldRetry(status: IUrlStatus) {
if (status.error) {
return true;
}
return isTransientError(status.statusCode)
|| status.statusCode === 429; // Too Many Requests
return isTransientError(status.code)
|| status.code === 429; // Too Many Requests
}
function isTransientError(statusCode: number) {

View File

@@ -0,0 +1,66 @@
import fetch from 'cross-fetch';
export function fetchFollow(
url: string, fetchOptions: RequestInit, followOptions: IFollowOptions): Promise<Response> {
followOptions = { ...DefaultOptions, ...followOptions };
if (!followOptions.followRedirects
|| followOptions.maximumRedirectFollowDepth === 0) {
return fetch(url, fetchOptions);
}
fetchOptions = { ...fetchOptions, redirect: 'manual' /* handled manually */ };
const cookies = new CookieStorage(followOptions.enableCookies);
return followRecursivelyWithCookies(
url, fetchOptions, followOptions.maximumRedirectFollowDepth, cookies);
}
export interface IFollowOptions {
followRedirects?: boolean;
maximumRedirectFollowDepth?: number;
enableCookies?: boolean;
}
const DefaultOptions: IFollowOptions = {
followRedirects: true,
maximumRedirectFollowDepth: 20,
enableCookies: true,
};
async function followRecursivelyWithCookies(
url: string, options: RequestInit, followDepth: number, cookies: CookieStorage): Promise<Response> {
if (cookies.hasAny()) {
options = { ...options, headers: { ...options.headers, cookie: cookies.getHeader() } };
}
const response = await fetch(url, options);
if (!isRedirect(response.status)) {
return response;
}
if (--followDepth < 0) {
throw new Error(`[max-redirect] maximum redirect reached at: ${url}`);
}
const cookieHeader = response.headers.get('set-cookie');
cookies.addHeader(cookieHeader);
const nextUrl = response.headers.get('location');
return followRecursivelyWithCookies(nextUrl, options, followDepth, cookies);
}
function isRedirect(code: number): boolean {
return code === 301 || code === 302 || code === 303 || code === 307 || code === 308;
}
class CookieStorage {
public cookies = new Array<string>();
constructor(private readonly enabled: boolean) {
}
public hasAny() {
return this.enabled && this.cookies.length > 0;
}
public addHeader(header: string) {
if (!this.enabled || !header) {
return;
}
this.cookies.push(header);
}
public getHeader() {
return this.cookies.join(' ; ');
}
}

View File

@@ -1,5 +1,5 @@
export interface IUrlStatus {
url: string;
error?: any;
statusCode?: number;
code?: number;
}

View File

@@ -0,0 +1,108 @@
# status-checker
CLI and SDK to check whether an external URL is alive.
🧐 Why?
- 🏃🏻 Batch checking status of URLs in parallel.
- 🤖 Zero-touch start, pre-configured for reliable results, still configurable.
- 🤞 Reliable, mimics a real web browser by following redirect, and cookie storage.
🍭 Sweets such as
- 😇 Queueing requests by domain to be nice to them
- 🔁 Retry pattern with exponential back-off
## CLI
Coming soon 🚧
## Programmatic usage
Programmatic usage is supported both on Node.js and browser.
### `getUrlStatusesInParallelAsync`
```js
// Simple example
const statuses = await getUrlStatusesInParallelAsync([ 'https://privacy.sexy', /* ... */ ]);
if(statuses.all((r) => r.code === 200)) {
console.log('All URLs are alive!');
} else {
console.log('Dead URLs:', statuses.filter((r) => r.code !== 200).map((r) => r.url));
}
// Fastest configuration
const statuses = await getUrlStatusesInParallelAsync([ 'https://privacy.sexy', /* ... */ ], {
domainOptions: {
sameDomainParallelize: false,
}
});
```
#### Batch request options
- `domainOptions`:
- **`sameDomainParallelize`**, (*boolean*), default: `false`
- Determines whether the requests to URLs under same domain will be parallelize.
- Setting `false` parallelizes all requests.
- Setting `true` sends requests in queue for each unique domain, still parallelizing for different domains.
- Requests to different domains are always parallelized regardless of this option.
- 💡 This helps to avoid `429 Too Many Requests` and be nice to websites
- **`sameDomainDelayInMs`** (*boolean*), default: `3000` (3 seconds)
- Sets delay between requests to same host (domain) if same domain parallelization is disabled.
- `requestOptions` (*object*): See [request options](#request-options).
### `getUrlStatusAsync`
Checks whether single URL is dead or alive.
```js
// Simple example
const status = await getUrlStatusAsync('https://privacy.sexy');
console.log(`Status code: ${status.code}`);
```
#### Request options
- **`retryExponentialBaseInMs`** (*boolean*), default: `5000` (5 seconds)
- The based time that's multiplied by exponential value for exponential backoff and retry calculations
- The longer it is, the longer the delay between retries are.
- **`additionalHeaders`** (*boolean*), default: `false`
- Additional headers that will be sent alongside default headers mimicking browser.
- If default header are specified, additional headers override defaults.
- **`followOptions`** (*object*): See [follow options](#follow-options).
### `fetchFollow`
Gets response from single URL by following `3XX` redirect targets by sending necessary cookies.
Same fetch API except third parameter that specifies [follow options](#follow-options), `redirect: 'follow' | 'manual' | 'error'` is discarded in favor of the third parameter.
```js
const status = await fetchFollow('https://privacy.sexy', {
// First argument is same options as fetch API, except `redirect` options
// that's discarded in favor of next argument follow options
headers: {
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'
},
}, {
// Second argument sets the redirect behavior
followRedirects: true,
maximumRedirectFollowDepth: 20,
enableCookies: true,
}
);
console.log(`Status code: ${status.code}`);
```
#### Follow options
- **`followRedirects`** (*boolean*), default: `true`
- Determines whether redirects with `3XX` response code will be followed.
- **`maximumRedirectFollowDepth`** (*boolean*), default: `20`
- Determines maximum consequent redirects that will be followed.
- 💡 Helps to solve maximum redirect reached errors.
- **`enableCookies`** (*boolean*), default: `true`
- Saves cookies requested to store by webpages and sends them when redirected.
- 💡 Helps to over-come sign-in challenges with callbacks.

View File

@@ -1,37 +1,44 @@
import { retryWithExponentialBackOffAsync } from './ExponentialBackOffRetryHandler';
import { IUrlStatus } from './IUrlStatus';
import fetch from 'cross-fetch';
export interface IRequestOptions {
retryExponentialBaseInMs?: number;
additionalHeaders?: Record<string, string>;
}
import { fetchFollow, IFollowOptions } from './FetchFollow';
export async function getUrlStatusAsync(
url: string,
options: IRequestOptions = DefaultOptions): Promise<IUrlStatus> {
options = { ...DefaultOptions, ...options };
const fetchOptions = getFetchOptions(options);
const fetchOptions = getFetchOptions(url, options);
return retryWithExponentialBackOffAsync(async () => {
console.log('Requesting', url); // tslint:disable-line: no-console
let result: IUrlStatus;
try {
const response = await fetch(url, fetchOptions);
return { url, statusCode: response.status};
const response = await fetchFollow(url, fetchOptions, options.followOptions);
result = { url, code: response.status };
} catch (err) {
return { url, error: err};
result = { url, error: err };
}
return result;
}, options.retryExponentialBaseInMs);
}
export interface IRequestOptions {
retryExponentialBaseInMs?: number;
additionalHeaders?: Record<string, string>;
additionalHeadersUrlIgnore?: string[];
followOptions?: IFollowOptions;
}
const DefaultOptions: IRequestOptions = {
retryExponentialBaseInMs: 5000,
additionalHeaders: {},
additionalHeadersUrlIgnore: [],
};
function getFetchOptions(options: IRequestOptions) {
function getFetchOptions(url: string, options: IRequestOptions): RequestInit {
const additionalHeaders = options.additionalHeadersUrlIgnore.some(
(ignorePattern) => url.match(ignorePattern)) ? {} : options.additionalHeaders;
return {
method: 'GET',
headers: { ...DefaultHeaders, ...options.additionalHeaders },
headers: { ...DefaultHeaders, ...additionalHeaders },
};
}