diff --git a/tests/integration/application/collections/NoDeadDocumentationUrls.spec.ts b/tests/integration/application/collections/NoDeadDocumentationUrls.spec.ts index b79439d0..f42a4b89 100644 --- a/tests/integration/application/collections/NoDeadDocumentationUrls.spec.ts +++ b/tests/integration/application/collections/NoDeadDocumentationUrls.spec.ts @@ -17,6 +17,9 @@ describe('collections', () => { requestOptions: { retryExponentialBaseInMs: 3 /* sec */ * 1000, additionalHeaders: { referer: app.info.homepage }, + additionalHeadersUrlIgnore: [ + 'http://batcmd.com/', // Otherwise it responds with 403 + ], }, }; const testTimeoutInMs = urls.length * 60000 /* 1 minute */; @@ -24,7 +27,7 @@ describe('collections', () => { // act const results = await getUrlStatusesInParallelAsync(urls, options); // assert - const deadUrls = results.filter((r) => r.statusCode !== 200); + const deadUrls = results.filter((r) => r.code !== 200); expect(deadUrls).to.have.lengthOf(0, printUrls(deadUrls)); }).timeout(testTimeoutInMs); }); @@ -41,7 +44,7 @@ function printUrls(statuses: IUrlStatus[]): string { return '\n' + statuses.map((status) => `- ${status.url}\n` + - (status.statusCode ? `\tResponse code: ${status.statusCode}` : '') + + (status.code ? `\tResponse code: ${status.code}` : '') + (status.error ? 
`\tException: ${JSON.stringify(status.error, null, '\t')}` : '')) .join(`\n`) + '\n'; diff --git a/tests/integration/application/collections/StatusChecker/BatchStatusChecker.ts b/tests/integration/application/collections/StatusChecker/BatchStatusChecker.ts index 64067c39..81a3735c 100644 --- a/tests/integration/application/collections/StatusChecker/BatchStatusChecker.ts +++ b/tests/integration/application/collections/StatusChecker/BatchStatusChecker.ts @@ -6,6 +6,7 @@ import { groupUrlsByDomain } from './UrlPerDomainGrouper'; export async function getUrlStatusesInParallelAsync( urls: string[], options?: IBatchRequestOptions): Promise { + // urls = [ 'https://privacy.sexy' ]; // Here to comment out when testing const uniqueUrls = Array.from(new Set(urls)); options = { ...DefaultOptions, ...options }; console.log('Options: ', options); // tslint:disable-line: no-console diff --git a/tests/integration/application/collections/StatusChecker/ExponentialBackOffRetryHandler.ts b/tests/integration/application/collections/StatusChecker/ExponentialBackOffRetryHandler.ts index 74a379cb..ea1fcfaf 100644 --- a/tests/integration/application/collections/StatusChecker/ExponentialBackOffRetryHandler.ts +++ b/tests/integration/application/collections/StatusChecker/ExponentialBackOffRetryHandler.ts @@ -25,8 +25,8 @@ function shouldRetry(status: IUrlStatus) { if (status.error) { return true; } - return isTransientError(status.statusCode) - ||ย status.statusCode === 429; // Too Many Requests + return isTransientError(status.code) + ||ย status.code === 429; // Too Many Requests } function isTransientError(statusCode: number) { diff --git a/tests/integration/application/collections/StatusChecker/FetchFollow.ts b/tests/integration/application/collections/StatusChecker/FetchFollow.ts new file mode 100644 index 00000000..9ba98ca9 --- /dev/null +++ b/tests/integration/application/collections/StatusChecker/FetchFollow.ts @@ -0,0 +1,66 @@ +import fetch from 'cross-fetch'; + +export function 
fetchFollow( + url: string, fetchOptions: RequestInit, followOptions: IFollowOptions): Promise { + followOptions = { ...DefaultOptions, ...followOptions }; + if (!followOptions.followRedirects + || followOptions.maximumRedirectFollowDepth === 0) { + return fetch(url, fetchOptions); + } + fetchOptions = { ...fetchOptions, redirect: 'manual' /* handled manually */ }; + const cookies = new CookieStorage(followOptions.enableCookies); + return followRecursivelyWithCookies( + url, fetchOptions, followOptions.maximumRedirectFollowDepth, cookies); +} + +export interface IFollowOptions { + followRedirects?: boolean; + maximumRedirectFollowDepth?: number; + enableCookies?: boolean; +} + +const DefaultOptions: IFollowOptions = { + followRedirects: true, + maximumRedirectFollowDepth: 20, + enableCookies: true, +}; + +async function followRecursivelyWithCookies( + url: string, options: RequestInit, followDepth: number, cookies: CookieStorage): Promise { + if (cookies.hasAny()) { + options = { ...options, headers: { ...options.headers, cookie: cookies.getHeader() } }; + } + const response = await fetch(url, options); + if (!isRedirect(response.status)) { + return response; + } + if (--followDepth < 0) { + throw new Error(`[max-redirect] maximum redirect reached at: ${url}`); + } + const cookieHeader = response.headers.get('set-cookie'); + cookies.addHeader(cookieHeader); + const nextUrl = response.headers.get('location'); + return followRecursivelyWithCookies(nextUrl, options, followDepth, cookies); +} + +function isRedirect(code: number): boolean { + return code === 301 || code === 302 || code === 303 || code === 307 || code === 308; +} + +class CookieStorage { + public cookies = new Array(); + constructor(private readonly enabled: boolean) { + } + public hasAny() { + return this.enabled && this.cookies.length > 0; + } + public addHeader(header: string) { + if (!this.enabled || !header) { + return; + } + this.cookies.push(header); + } + public getHeader() { + return 
this.cookies.join(' ; '); + } +} diff --git a/tests/integration/application/collections/StatusChecker/IUrlStatus.ts b/tests/integration/application/collections/StatusChecker/IUrlStatus.ts index 7e581e9a..77104fa2 100644 --- a/tests/integration/application/collections/StatusChecker/IUrlStatus.ts +++ b/tests/integration/application/collections/StatusChecker/IUrlStatus.ts @@ -1,5 +1,5 @@ export interface IUrlStatus { url: string; error?: any; - statusCode?: number; + code?: number; } diff --git a/tests/integration/application/collections/StatusChecker/README.md b/tests/integration/application/collections/StatusChecker/README.md new file mode 100644 index 00000000..ad5b0770 --- /dev/null +++ b/tests/integration/application/collections/StatusChecker/README.md @@ -0,0 +1,108 @@ +# status-checker + +CLI and SDK to check whether an external URL is alive. + +🧐 Why? + +- 🏃🏻 Batch checking status of URLs in parallel. +- 🤖 Zero-touch start, pre-configured for reliable results, still configurable. +- 🤞 Reliable, mimics a real web browser by following redirects and storing cookies. + +🍭 Sweets such as + +- 😇 Queueing requests by domain to be nice to them +- 🔁 Retry pattern with exponential back-off + +## CLI + +Coming soon 🚧 + +## Programmatic usage + +Programmatic usage is supported on both Node.js and browsers. + +### `getUrlStatusesInParallelAsync` + +```js +// Simple example +const statuses = await getUrlStatusesInParallelAsync([ 'https://privacy.sexy', /* ... */ ]); +if(statuses.every((r) => r.code === 200)) { + console.log('All URLs are alive!'); +} else { + console.log('Dead URLs:', statuses.filter((r) => r.code !== 200).map((r) => r.url)); +} + +// Fastest configuration +const statuses = await getUrlStatusesInParallelAsync([ 'https://privacy.sexy', /* ... 
*/ ], { + domainOptions: { + sameDomainParallelize: false, + } +}); +``` + +#### Batch request options + +- `domainOptions`: + - **`sameDomainParallelize`** (*boolean*), default: `false` + - Determines whether the requests to URLs under the same domain will be parallelized. + - Setting `false` parallelizes all requests. + - Setting `true` sends requests in queue for each unique domain, still parallelizing for different domains. + - Requests to different domains are always parallelized regardless of this option. + - 💡 This helps to avoid `429 Too Many Requests` and be nice to websites. + - **`sameDomainDelayInMs`** (*number*), default: `3000` (3 seconds) + - Sets the delay between requests to the same host (domain) if same-domain parallelization is disabled. +- `requestOptions` (*object*): See [request options](#request-options). + +### `getUrlStatusAsync` + +Checks whether a single URL is dead or alive. + +```js +// Simple example +const status = await getUrlStatusAsync('https://privacy.sexy'); +console.log(`Status code: ${status.code}`); +``` + +#### Request options + +- **`retryExponentialBaseInMs`** (*number*), default: `5000` (5 seconds) + - The base time that's multiplied by the exponential value for exponential back-off and retry calculations. + - The longer it is, the longer the delay between retries is. +- **`additionalHeaders`** (*object*), default: `{}` + - Additional headers that will be sent alongside the default headers mimicking a browser. + - If a default header is also specified in the additional headers, the additional header overrides the default. +- **`followOptions`** (*object*): See [follow options](#follow-options). + +### `fetchFollow` + +Gets the response from a single URL by following `3XX` redirect targets and sending the necessary cookies. + +Same as the fetch API, except for a third parameter that specifies [follow options](#follow-options); `redirect: 'follow' | 'manual' | 'error'` is discarded in favor of the third parameter. 
+ +```js +const status = await fetchFollow('https://privacy.sexy', { + // Second argument takes the same options as the fetch API, except the `redirect` option, + // which is discarded in favor of the follow options in the next argument + headers: { + 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0' + }, + }, { + // Third argument sets the redirect behavior + followRedirects: true, + maximumRedirectFollowDepth: 20, + enableCookies: true, + } +); +console.log(`Status code: ${status.status}`); +``` + +#### Follow options + +- **`followRedirects`** (*boolean*), default: `true` + - Determines whether redirects with `3XX` response codes will be followed. +- **`maximumRedirectFollowDepth`** (*number*), default: `20` + - Determines the maximum number of consecutive redirects that will be followed. + - 💡 Helps to solve maximum redirect reached errors. +- **`enableCookies`** (*boolean*), default: `true` + - Stores cookies that webpages ask to set and sends them when redirected. + - 💡 Helps to overcome sign-in challenges with callbacks. 
diff --git a/tests/integration/application/collections/StatusChecker/Requestor.ts b/tests/integration/application/collections/StatusChecker/Requestor.ts index b4683bcb..106e1163 100644 --- a/tests/integration/application/collections/StatusChecker/Requestor.ts +++ b/tests/integration/application/collections/StatusChecker/Requestor.ts @@ -1,37 +1,44 @@ import { retryWithExponentialBackOffAsync } from './ExponentialBackOffRetryHandler'; import { IUrlStatus } from './IUrlStatus'; -import fetch from 'cross-fetch'; - -export interface IRequestOptions { - retryExponentialBaseInMs?: number; - additionalHeaders?: Record; -} +import { fetchFollow, IFollowOptions } from './FetchFollow'; export async function getUrlStatusAsync( url: string, options: IRequestOptions = DefaultOptions): Promiseย { options = { ...DefaultOptions, ...options }; - const fetchOptions = getFetchOptions(options); + const fetchOptions = getFetchOptions(url, options); return retryWithExponentialBackOffAsync(async () => { console.log('Requesting', url); // tslint:disable-line: no-console + let result: IUrlStatus; try { - const response = await fetch(url, fetchOptions); - return { url, statusCode: response.status}; + const response = await fetchFollow(url, fetchOptions, options.followOptions); + result = { url, code: response.status }; } catch (err) { - return { url, error: err}; + result = { url, error: err }; } + return result; }, options.retryExponentialBaseInMs); } +export interface IRequestOptions { + retryExponentialBaseInMs?: number; + additionalHeaders?: Record; + additionalHeadersUrlIgnore?: string[]; + followOptions?: IFollowOptions; +} + const DefaultOptions: IRequestOptions = { retryExponentialBaseInMs: 5000, additionalHeaders: {}, + additionalHeadersUrlIgnore: [], }; -function getFetchOptions(options: IRequestOptions) { +function getFetchOptions(url: string, options: IRequestOptions): RequestInit { + const additionalHeaders = options.additionalHeadersUrlIgnore.some( + (ignorePattern) => 
url.match(ignorePattern)) ? {} : options.additionalHeaders; return { method: 'GET', - headers: { ...DefaultHeaders, ...options.additionalHeaders }, + headers: { ...DefaultHeaders, ...additionalHeaders }, }; }