Fix failing URL status checking integration tests
Implement following redirects over `fetch` supporting cookies.
`node-fetch` does not support sending cookies during redirect. However,
this is needed to avoid ending up in a redirect loop for a sign-in callback.
Fix integration tests failing due to redirects and 403 errors:
- Many redirects from `answers.microsoft.com` were throwing a
`FetchError: maximum redirect reached` error. It was caused by not
having cookies when following redirects, which resulted in an infinite
sign-in callback for the webpage.
- Fixes integration tests failing due to additional referer header being
sent by the application. It adds support for making exceptions to
additional header sending through a list of regexes.
Add in-depth documentation for URL status checking.
This commit is contained in:
@@ -17,6 +17,9 @@ describe('collections', () => {
|
|||||||
requestOptions: {
|
requestOptions: {
|
||||||
retryExponentialBaseInMs: 3 /* sec */ * 1000,
|
retryExponentialBaseInMs: 3 /* sec */ * 1000,
|
||||||
additionalHeaders: { referer: app.info.homepage },
|
additionalHeaders: { referer: app.info.homepage },
|
||||||
|
additionalHeadersUrlIgnore: [
|
||||||
|
'http://batcmd.com/', // Otherwise it responds with 403
|
||||||
|
],
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
const testTimeoutInMs = urls.length * 60000 /* 1 minute */;
|
const testTimeoutInMs = urls.length * 60000 /* 1 minute */;
|
||||||
@@ -24,7 +27,7 @@ describe('collections', () => {
|
|||||||
// act
|
// act
|
||||||
const results = await getUrlStatusesInParallelAsync(urls, options);
|
const results = await getUrlStatusesInParallelAsync(urls, options);
|
||||||
// assert
|
// assert
|
||||||
const deadUrls = results.filter((r) => r.statusCode !== 200);
|
const deadUrls = results.filter((r) => r.code !== 200);
|
||||||
expect(deadUrls).to.have.lengthOf(0, printUrls(deadUrls));
|
expect(deadUrls).to.have.lengthOf(0, printUrls(deadUrls));
|
||||||
}).timeout(testTimeoutInMs);
|
}).timeout(testTimeoutInMs);
|
||||||
});
|
});
|
||||||
@@ -41,7 +44,7 @@ function printUrls(statuses: IUrlStatus[]): string {
|
|||||||
return '\n' +
|
return '\n' +
|
||||||
statuses.map((status) =>
|
statuses.map((status) =>
|
||||||
`- ${status.url}\n` +
|
`- ${status.url}\n` +
|
||||||
(status.statusCode ? `\tResponse code: ${status.statusCode}` : '') +
|
(status.code ? `\tResponse code: ${status.code}` : '') +
|
||||||
(status.error ? `\tException: ${JSON.stringify(status.error, null, '\t')}` : ''))
|
(status.error ? `\tException: ${JSON.stringify(status.error, null, '\t')}` : ''))
|
||||||
.join(`\n`)
|
.join(`\n`)
|
||||||
+ '\n';
|
+ '\n';
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import { groupUrlsByDomain } from './UrlPerDomainGrouper';
|
|||||||
export async function getUrlStatusesInParallelAsync(
|
export async function getUrlStatusesInParallelAsync(
|
||||||
urls: string[],
|
urls: string[],
|
||||||
options?: IBatchRequestOptions): Promise<IUrlStatus[]> {
|
options?: IBatchRequestOptions): Promise<IUrlStatus[]> {
|
||||||
|
// urls = [ 'https://privacy.sexy' ]; // Here to comment out when testing
|
||||||
const uniqueUrls = Array.from(new Set(urls));
|
const uniqueUrls = Array.from(new Set(urls));
|
||||||
options = { ...DefaultOptions, ...options };
|
options = { ...DefaultOptions, ...options };
|
||||||
console.log('Options: ', options); // tslint:disable-line: no-console
|
console.log('Options: ', options); // tslint:disable-line: no-console
|
||||||
|
|||||||
@@ -25,8 +25,8 @@ function shouldRetry(status: IUrlStatus) {
|
|||||||
if (status.error) {
|
if (status.error) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return isTransientError(status.statusCode)
|
return isTransientError(status.code)
|
||||||
|| status.statusCode === 429; // Too Many Requests
|
|| status.code === 429; // Too Many Requests
|
||||||
}
|
}
|
||||||
|
|
||||||
function isTransientError(statusCode: number) {
|
function isTransientError(statusCode: number) {
|
||||||
|
|||||||
@@ -0,0 +1,66 @@
|
|||||||
|
import fetch from 'cross-fetch';
|
||||||
|
|
||||||
|
/**
 * Requests `url` and, when enabled, walks the redirect chain manually so that
 * cookies set by intermediate responses can be sent with the follow-up requests.
 *
 * The `redirect` field of `fetchOptions` is always overridden with `'manual'`:
 * redirect behaviour is controlled exclusively through `followOptions`. When
 * following is disabled (or the maximum depth is `0`) a single request is made
 * and a redirect response is handed back to the caller as-is instead of being
 * followed natively by the underlying fetch implementation.
 *
 * @param url - Address to request.
 * @param fetchOptions - Regular fetch options (method, headers, ...).
 * @param followOptions - Redirect settings; omitted fields fall back to `DefaultOptions`.
 * @returns The first non-redirect response, or the raw response when following is disabled.
 *          The promise rejects with a `[max-redirect]` error when the chain is longer than
 *          `maximumRedirectFollowDepth`.
 */
export function fetchFollow(
    url: string, fetchOptions: RequestInit, followOptions: IFollowOptions): Promise<Response> {
    followOptions = { ...DefaultOptions, ...followOptions };
    // Never let the underlying fetch follow redirects on its own; otherwise
    // `followRedirects: false` would still be followed by fetch's default mode.
    fetchOptions = { ...fetchOptions, redirect: 'manual' /* handled manually */ };
    if (!followOptions.followRedirects
        || followOptions.maximumRedirectFollowDepth === 0) {
        return fetch(url, fetchOptions);
    }
    const cookies = new CookieStorage(followOptions.enableCookies);
    return followRecursivelyWithCookies(
        url, fetchOptions, followOptions.maximumRedirectFollowDepth, cookies);
}
|
||||||
|
|
||||||
|
/** Settings that control how `fetchFollow` deals with redirect responses. */
export interface IFollowOptions {
    /** When falsy, `fetchFollow` issues a single request and does not walk the redirect chain itself. */
    followRedirects?: boolean;
    /**
     * Upper bound on the number of redirects walked before failing with a
     * `[max-redirect]` error. A value of `0` disables manual following.
     */
    maximumRedirectFollowDepth?: number;
    /** Whether `set-cookie` values seen on redirect responses are stored and sent on follow-up requests. */
    enableCookies?: boolean;
}
|
||||||
|
|
||||||
|
/**
 * Fallback values merged underneath the caller-supplied follow options:
 * redirects are followed, at most 20 of them, with cookie storage switched on.
 */
const DefaultOptions: IFollowOptions = {
    followRedirects: true,
    maximumRedirectFollowDepth: 20,
    enableCookies: true,
};
|
||||||
|
|
||||||
|
/**
 * Requests `url` and keeps following redirect responses until a non-redirect
 * response arrives or the remaining depth is exhausted.
 *
 * @param url - Absolute address to request.
 * @param options - Fetch options reused for every hop; a `cookie` header is added once cookies exist.
 * @param followDepth - Number of redirects that may still be followed.
 * @param cookies - Storage that collects `set-cookie` values across hops.
 * @returns The first non-redirect response, or a redirect response that carries no `location` header.
 * @throws Error prefixed with `[max-redirect]` when a redirect arrives and no depth is left.
 */
async function followRecursivelyWithCookies(
    url: string, options: RequestInit, followDepth: number, cookies: CookieStorage): Promise<Response> {
    if (cookies.hasAny()) {
        // Rebuild the options object so the caller's headers are never mutated.
        options = { ...options, headers: { ...options.headers, cookie: cookies.getHeader() } };
    }
    const response = await fetch(url, options);
    if (!isRedirect(response.status)) {
        return response;
    }
    if (--followDepth < 0) {
        throw new Error(`[max-redirect] maximum redirect reached at: ${url}`);
    }
    const location = response.headers.get('location');
    if (!location) {
        // A redirect status without a target cannot be followed; let the caller see it.
        return response;
    }
    cookies.addHeader(response.headers.get('set-cookie'));
    // `location` may be relative (e.g. `/login`), so resolve it against the current URL.
    const nextUrl = new URL(location, url).toString();
    return followRecursivelyWithCookies(nextUrl, options, followDepth, cookies);
}
|
||||||
|
|
||||||
|
/**
 * Tells whether an HTTP status code is one of the redirect codes handled by
 * the follow logic: 301, 302, 303, 307 or 308.
 */
function isRedirect(code: number): boolean {
    switch (code) {
        case 301:
        case 302:
        case 303:
        case 307:
        case 308:
            return true;
        default:
            return false;
    }
}
|
||||||
|
|
||||||
|
/**
 * Minimal in-memory cookie jar used while following a redirect chain.
 *
 * Only the `name=value` pair of every `Set-Cookie` value is kept; attributes
 * such as `Path`, `Expires` or `HttpOnly` are dropped because they must not be
 * echoed back in a `Cookie` request header. Setting a cookie whose name is
 * already stored replaces the earlier value.
 */
class CookieStorage {
    /** Stored cookies, each as a `name=value` pair. */
    public cookies = new Array<string>();

    /**
     * @param enabled - When falsy the storage ignores every header and always reports itself as empty.
     */
    constructor(private readonly enabled: boolean) {
    }

    /** Returns `true` when the storage is enabled and holds at least one cookie. */
    public hasAny() {
        return this.enabled && this.cookies.length > 0;
    }

    /**
     * Stores the cookies found in a `Set-Cookie` header value.
     *
     * @param header - Raw header value; several `Set-Cookie` values combined with commas are
     *                 split apart first. Empty or missing values are ignored.
     */
    public addHeader(header: string | null) {
        if (!this.enabled || !header) {
            return;
        }
        for (const setCookie of CookieStorage.splitCombinedHeader(header)) {
            // The cookie pair is everything before the first attribute separator.
            const pair = setCookie.split(';')[0].trim();
            if (pair.indexOf('=') > 0) { // skip values without a name
                this.store(pair);
            }
        }
    }

    /** Builds the value for a `Cookie` request header from the stored pairs. */
    public getHeader() {
        return this.cookies.join('; ');
    }

    /**
     * Splits a comma-combined `Set-Cookie` value into individual cookies.
     * Only commas that are followed by a `name=` token are treated as
     * separators, so the comma inside an `Expires` date is left alone.
     */
    private static splitCombinedHeader(header: string): string[] {
        return header.split(/,(?=\s*[^\s;,=]+=)/);
    }

    /** Adds a `name=value` pair, replacing an existing pair with the same name. */
    private store(pair: string) {
        const nameOf = (cookie: string) => cookie.slice(0, cookie.indexOf('='));
        const name = nameOf(pair);
        const existingIndex = this.cookies.findIndex((cookie) => nameOf(cookie) === name);
        if (existingIndex === -1) {
            this.cookies.push(pair);
        } else {
            this.cookies[existingIndex] = pair;
        }
    }
}
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
export interface IUrlStatus {
|
export interface IUrlStatus {
|
||||||
url: string;
|
url: string;
|
||||||
error?: any;
|
error?: any;
|
||||||
statusCode?: number;
|
code?: number;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,108 @@
|
|||||||
|
# status-checker
|
||||||
|
|
||||||
|
CLI and SDK to check whether an external URL is alive.
|
||||||
|
|
||||||
|
🧐 Why?
|
||||||
|
|
||||||
|
- 🏃🏻 Batch checking status of URLs in parallel.
|
||||||
|
- 🤖 Zero-touch start, pre-configured for reliable results, still configurable.
|
||||||
|
- 🤞 Reliable, mimics a real web browser by following redirects and storing cookies.
|
||||||
|
|
||||||
|
🍭 Sweets such as
|
||||||
|
|
||||||
|
- 😇 Queueing requests by domain to be nice to them
|
||||||
|
- 🔁 Retry pattern with exponential back-off
|
||||||
|
|
||||||
|
## CLI
|
||||||
|
|
||||||
|
Coming soon 🚧
|
||||||
|
|
||||||
|
## Programmatic usage
|
||||||
|
|
||||||
|
Programmatic usage is supported both on Node.js and browser.
|
||||||
|
|
||||||
|
### `getUrlStatusesInParallelAsync`
|
||||||
|
|
||||||
|
```js
|
||||||
|
// Simple example
|
||||||
|
const statuses = await getUrlStatusesInParallelAsync([ 'https://privacy.sexy', /* ... */ ]);
|
||||||
|
if (statuses.every((r) => r.code === 200)) {
|
||||||
|
console.log('All URLs are alive!');
|
||||||
|
} else {
|
||||||
|
console.log('Dead URLs:', statuses.filter((r) => r.code !== 200).map((r) => r.url));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fastest configuration
|
||||||
|
const statuses = await getUrlStatusesInParallelAsync([ 'https://privacy.sexy', /* ... */ ], {
|
||||||
|
domainOptions: {
|
||||||
|
sameDomainParallelize: false,
|
||||||
|
}
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Batch request options
|
||||||
|
|
||||||
|
- `domainOptions`:
|
||||||
|
- **`sameDomainParallelize`**, (*boolean*), default: `false`
|
||||||
|
- Determines whether the requests to URLs under the same domain will be parallelized.
|
||||||
|
- Setting `false` parallelizes all requests.
|
||||||
|
- Setting `true` sends requests in queue for each unique domain, still parallelizing for different domains.
|
||||||
|
- Requests to different domains are always parallelized regardless of this option.
|
||||||
|
- 💡 This helps to avoid `429 Too Many Requests` and be nice to websites
|
||||||
|
- **`sameDomainDelayInMs`** (*number*), default: `3000` (3 seconds)
|
||||||
|
- Sets delay between requests to same host (domain) if same domain parallelization is disabled.
|
||||||
|
- `requestOptions` (*object*): See [request options](#request-options).
|
||||||
|
|
||||||
|
### `getUrlStatusAsync`
|
||||||
|
|
||||||
|
Checks whether single URL is dead or alive.
|
||||||
|
|
||||||
|
```js
|
||||||
|
// Simple example
|
||||||
|
const status = await getUrlStatusAsync('https://privacy.sexy');
|
||||||
|
console.log(`Status code: ${status.code}`);
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Request options
|
||||||
|
|
||||||
|
- **`retryExponentialBaseInMs`** (*number*), default: `5000` (5 seconds)
|
||||||
|
- The base time that is multiplied by an exponential value for exponential back-off and retry calculations.
|
||||||
|
- The longer it is, the longer the delay between retries is.
|
||||||
|
- **`additionalHeaders`** (*object*), default: `{}`
|
||||||
|
- Additional headers that will be sent alongside default headers mimicking browser.
|
||||||
|
- If default headers are specified, additional headers override defaults.
|
||||||
|
- **`followOptions`** (*object*): See [follow options](#follow-options).
|
||||||
|
|
||||||
|
### `fetchFollow`
|
||||||
|
|
||||||
|
Gets response from single URL by following `3XX` redirect targets by sending necessary cookies.
|
||||||
|
|
||||||
|
Same fetch API except third parameter that specifies [follow options](#follow-options), `redirect: 'follow' | 'manual' | 'error'` is discarded in favor of the third parameter.
|
||||||
|
|
||||||
|
```js
|
||||||
|
const status = await fetchFollow('https://privacy.sexy', {
|
||||||
|
// Second argument is the same options as the fetch API, except the `redirect` option
|
||||||
|
// that's discarded in favor of next argument follow options
|
||||||
|
headers: {
|
||||||
|
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'
|
||||||
|
},
|
||||||
|
}, {
|
||||||
|
// Third argument sets the redirect behavior
|
||||||
|
followRedirects: true,
|
||||||
|
maximumRedirectFollowDepth: 20,
|
||||||
|
enableCookies: true,
|
||||||
|
}
|
||||||
|
);
|
||||||
|
console.log(`Status code: ${status.status}`);
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Follow options
|
||||||
|
|
||||||
|
- **`followRedirects`** (*boolean*), default: `true`
|
||||||
|
- Determines whether redirects with `3XX` response code will be followed.
|
||||||
|
- **`maximumRedirectFollowDepth`** (*number*), default: `20`
|
||||||
|
- Determines the maximum number of consecutive redirects that will be followed.
|
||||||
|
- 💡 Helps to solve maximum redirect reached errors.
|
||||||
|
- **`enableCookies`** (*boolean*), default: `true`
|
||||||
|
- Saves cookies requested to store by webpages and sends them when redirected.
|
||||||
|
- 💡 Helps to overcome sign-in challenges with callbacks.
|
||||||
@@ -1,37 +1,44 @@
|
|||||||
import { retryWithExponentialBackOffAsync } from './ExponentialBackOffRetryHandler';
|
import { retryWithExponentialBackOffAsync } from './ExponentialBackOffRetryHandler';
|
||||||
import { IUrlStatus } from './IUrlStatus';
|
import { IUrlStatus } from './IUrlStatus';
|
||||||
import fetch from 'cross-fetch';
|
import { fetchFollow, IFollowOptions } from './FetchFollow';
|
||||||
|
|
||||||
export interface IRequestOptions {
|
|
||||||
retryExponentialBaseInMs?: number;
|
|
||||||
additionalHeaders?: Record<string, string>;
|
|
||||||
}
|
|
||||||
|
|
||||||
export async function getUrlStatusAsync(
|
export async function getUrlStatusAsync(
|
||||||
url: string,
|
url: string,
|
||||||
options: IRequestOptions = DefaultOptions): Promise<IUrlStatus> {
|
options: IRequestOptions = DefaultOptions): Promise<IUrlStatus> {
|
||||||
options = { ...DefaultOptions, ...options };
|
options = { ...DefaultOptions, ...options };
|
||||||
const fetchOptions = getFetchOptions(options);
|
const fetchOptions = getFetchOptions(url, options);
|
||||||
return retryWithExponentialBackOffAsync(async () => {
|
return retryWithExponentialBackOffAsync(async () => {
|
||||||
console.log('Requesting', url); // tslint:disable-line: no-console
|
console.log('Requesting', url); // tslint:disable-line: no-console
|
||||||
|
let result: IUrlStatus;
|
||||||
try {
|
try {
|
||||||
const response = await fetch(url, fetchOptions);
|
const response = await fetchFollow(url, fetchOptions, options.followOptions);
|
||||||
return { url, statusCode: response.status};
|
result = { url, code: response.status };
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
return { url, error: err};
|
result = { url, error: err };
|
||||||
}
|
}
|
||||||
|
return result;
|
||||||
}, options.retryExponentialBaseInMs);
|
}, options.retryExponentialBaseInMs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export interface IRequestOptions {
|
||||||
|
retryExponentialBaseInMs?: number;
|
||||||
|
additionalHeaders?: Record<string, string>;
|
||||||
|
additionalHeadersUrlIgnore?: string[];
|
||||||
|
followOptions?: IFollowOptions;
|
||||||
|
}
|
||||||
|
|
||||||
const DefaultOptions: IRequestOptions = {
|
const DefaultOptions: IRequestOptions = {
|
||||||
retryExponentialBaseInMs: 5000,
|
retryExponentialBaseInMs: 5000,
|
||||||
additionalHeaders: {},
|
additionalHeaders: {},
|
||||||
|
additionalHeadersUrlIgnore: [],
|
||||||
};
|
};
|
||||||
|
|
||||||
function getFetchOptions(options: IRequestOptions) {
|
function getFetchOptions(url: string, options: IRequestOptions): RequestInit {
|
||||||
|
const additionalHeaders = options.additionalHeadersUrlIgnore.some(
|
||||||
|
(ignorePattern) => url.match(ignorePattern)) ? {} : options.additionalHeaders;
|
||||||
return {
|
return {
|
||||||
method: 'GET',
|
method: 'GET',
|
||||||
headers: { ...DefaultHeaders, ...options.additionalHeaders },
|
headers: { ...DefaultHeaders, ...additionalHeaders },
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user