fix broken URLs and automate broken URL checks #70
This commit: - Fixes broken URLs using archive.org or other references. - Replaces tenforums.com URLs with better documentation, as they tend to return HTTP status code 403 to tests and are also a low-quality source. - Changes all insecure HTTP sources to HTTPS alternatives. - Adds integration tests to check for broken URLs. - Implements logic for adding a delay between requests sent to the same domain; however, it is not used because the sources can handle fully parallelized requests. - Runs the test pipeline weekly to get notified about broken URLs even when there are no commits.
This commit is contained in:
@@ -0,0 +1,48 @@
|
||||
import 'mocha';
|
||||
import { expect } from 'chai';
|
||||
import { parseApplication } from '@/application/Parser/ApplicationParser';
|
||||
import { IApplication } from '@/domain/IApplication';
|
||||
import { IUrlStatus } from './StatusChecker/IUrlStatus';
|
||||
import { getUrlStatusesInParallelAsync, IBatchRequestOptions } from './StatusChecker/BatchStatusChecker';
|
||||
|
||||
// Integration suite: requests every documentation URL referenced by the
// application's scripts and fails when any of them does not answer HTTP 200.
describe('collections', () => {
  // arrange
  const oneMinuteInMs = 60000;
  const application = parseApplication();
  const documentationUrls = collectUniqueUrls(application);
  const batchOptions: IBatchRequestOptions = {
    domainOptions: {
      sameDomainParallelize: true, // no need to be so nice until sources start failing
      // sameDomainDelayInMs: 2 /* sec */ * 1000,
    },
    requestOptions: {
      retryExponentialBaseInMs: 3 /* sec */ * 1000,
      additionalHeaders: { referer: application.info.homepage },
    },
  };
  // The time budget grows with the number of URLs: one minute per URL.
  const testTimeoutInMs = documentationUrls.length * oneMinuteInMs;
  it('have no dead urls', async () => {
    // act
    const statuses = await getUrlStatusesInParallelAsync(documentationUrls, batchOptions);
    // assert
    const deadUrls = statuses.filter((status) => status.statusCode !== 200);
    expect(deadUrls).to.have.lengthOf(0, printUrls(deadUrls));
  }).timeout(testTimeoutInMs);
});
|
||||
|
||||
/**
 * Gathers the documentation URLs of every script in every collection of the
 * application.
 *
 * @param app Application whose collections are scanned.
 * @returns Each distinct URL once, in order of first appearance.
 */
function collectUniqueUrls(app: IApplication): string[] {
  const allScripts = app.collections.flatMap((collection) => collection.getAllScripts());
  const allUrls = allScripts.flatMap((script) => script.documentationUrls);
  // A Set keeps insertion order, so the first occurrence of each URL wins.
  return Array.from(new Set(allUrls));
}
|
||||
|
||||
/**
 * Renders URL statuses as a human-readable list for the assertion message.
 *
 * Each entry starts with "- <url>" on its own line, followed by the response
 * code and/or the JSON-serialized error when those are present.
 *
 * @param statuses Statuses to render.
 * @returns The rendered list, wrapped in a leading and a trailing newline.
 */
function printUrls(statuses: IUrlStatus[]): string {
  const describeStatus = (status: IUrlStatus): string => {
    let text = `- ${status.url}\n`;
    if (status.statusCode) {
      text += `\tResponse code: ${status.statusCode}`;
    }
    if (status.error) {
      text += `\tException: ${JSON.stringify(status.error, null, '\t')}`;
    }
    return text;
  };
  const entries = statuses.map((status) => describeStatus(status));
  return `\n${entries.join('\n')}\n`;
}
|
||||
@@ -0,0 +1,67 @@
|
||||
import { sleepAsync } from '@/infrastructure/Threading/AsyncSleep';
|
||||
import { IUrlStatus } from './IUrlStatus';
|
||||
import { getUrlStatusAsync, IRequestOptions } from './Requestor';
|
||||
import { groupUrlsByDomain } from './UrlPerDomainGrouper';
|
||||
|
||||
/**
 * Requests every distinct URL and reports the outcome of each request.
 *
 * Duplicate URLs are requested only once. Caller-supplied options are merged
 * with the defaults per option group, so a partially filled `domainOptions`
 * or `requestOptions` keeps the defaults of the fields it leaves out.
 *
 * @param urls URLs to check.
 * @param options Optional overrides of the default batch behavior.
 * @returns One status per distinct URL.
 */
export async function getUrlStatusesInParallelAsync(
  urls: string[],
  options?: IBatchRequestOptions): Promise<IUrlStatus[]> {
  const uniqueUrls = Array.from(new Set(urls));
  const overrides: IBatchRequestOptions = options || {};
  // A single top-level spread would replace a whole nested group and silently
  // drop its defaults (e.g. the same-domain delay), so merge group by group.
  const mergedOptions: IBatchRequestOptions = {
    domainOptions: { ...DefaultOptions.domainOptions, ...overrides.domainOptions },
    requestOptions: { ...DefaultOptions.requestOptions, ...overrides.requestOptions },
  };
  console.log('Options: ', mergedOptions); // tslint:disable-line: no-console
  return requestAsync(uniqueUrls, mergedOptions);
}
|
||||
|
||||
/** Options accepted when checking a batch of URLs. */
export interface IBatchRequestOptions {
  /** Controls how requests that target the same domain are scheduled. */
  domainOptions?: IDomainOptions;
  /** Options handed to every individual URL request. */
  requestOptions?: IRequestOptions;
}
|
||||
|
||||
/** Scheduling options for requests that target the same domain. */
interface IDomainOptions {
  /**
   * When true, all URLs are requested at once; otherwise the URLs of one
   * domain are requested one after another.
   */
  sameDomainParallelize?: boolean;
  /**
   * Pause, in milliseconds, between two consecutive requests to the same
   * domain. Only used when requests are not parallelized.
   */
  sameDomainDelayInMs?: number;
}
|
||||
|
||||
/** Batch options applied when the caller does not override them. */
const DefaultOptions: IBatchRequestOptions = {
  domainOptions: {
    // By default, requests to the same domain run one at a time...
    sameDomainParallelize: false,
    // ...with a three-second pause between them.
    sameDomainDelayInMs: 3000,
  },
  requestOptions: {
    retryExponentialBaseInMs: 5000, // five seconds
    additionalHeaders: {},
  },
};
|
||||
|
||||
/**
 * Dispatches the URL checks according to the domain options.
 *
 * @param urls URLs to check.
 * @param options Batch options; `domainOptions` is read without a fallback.
 * @returns One status per URL.
 */
function requestAsync(urls: string[], options: IBatchRequestOptions): Promise<IUrlStatus[]> {
  const checkUrl = (url: string) => getUrlStatusAsync(url, options.requestOptions);
  if (options.domainOptions.sameDomainParallelize) {
    // Fire every request at once, regardless of domain.
    return Promise.all(urls.map((url) => checkUrl(url)));
  }
  // Otherwise request the URLs of each domain sequentially with a pause in between.
  return runOnEachDomainWithDelayAsync(
    urls,
    checkUrl,
    options.domainOptions.sameDomainDelayInMs);
}
|
||||
|
||||
/**
 * Runs `action` for every URL: domains are processed concurrently, while the
 * URLs of a single domain are processed one after another with a pause
 * between consecutive URLs.
 *
 * @param urls URLs to process.
 * @param action Asynchronous work to perform for a single URL.
 * @param delayInMs Pause between two URLs of the same domain, in milliseconds.
 * @returns The statuses of all URLs, concatenated domain by domain.
 */
async function runOnEachDomainWithDelayAsync(
  urls: string[],
  action: (url: string) => Promise<IUrlStatus>,
  delayInMs: number): Promise<IUrlStatus[]> {
  const runsPerDomain = groupUrlsByDomain(urls).map(async (domainUrls) => {
    const statuses: IUrlStatus[] = [];
    const lastIndex = domainUrls.length - 1;
    for (let index = 0; index < domainUrls.length; index++) {
      statuses.push(await action(domainUrls[index]));
      // No need to wait once the last URL of the domain has been handled.
      if (index < lastIndex) {
        await sleepAsync(delayInMs);
      }
    }
    return statuses;
  });
  const statusesPerDomain = await Promise.all(runsPerDomain);
  return statusesPerDomain.flat();
}
|
||||
@@ -0,0 +1,44 @@
|
||||
import { sleepAsync } from '@/infrastructure/Threading/AsyncSleep';
|
||||
import { IUrlStatus } from './IUrlStatus';
|
||||
|
||||
/** Base retry interval (five seconds) used when the caller does not supply one. */
const DefaultBaseRetryIntervalInMs = 5000;
|
||||
|
||||
/**
 * Runs `action` and repeats it with growing pauses while its result is
 * considered retryable.
 *
 * A retry happens only while the retry counter does not exceed three, so with
 * the default counter the action runs at most four times (the initial attempt
 * plus three retries). The last obtained status is returned either way.
 *
 * @param action Produces the status of a single attempt.
 * @param baseRetryIntervalInMs Base interval the back-off is derived from.
 * @param currentRetry Retry counter to start from.
 * @returns The status of the final attempt.
 */
export async function retryWithExponentialBackOffAsync(
  action: () => Promise<IUrlStatus>,
  baseRetryIntervalInMs: number = DefaultBaseRetryIntervalInMs,
  currentRetry = 1): Promise<IUrlStatus> {
  const maxRetries = 3;
  let retryNumber = currentRetry;
  for (;;) {
    const status = await action();
    if (!shouldRetry(status) || retryNumber > maxRetries) {
      return status;
    }
    const waitInMs = getRetryTimeoutInMs(retryNumber, baseRetryIntervalInMs);
    // tslint:disable-next-line: no-console
    console.log(`Retrying (${retryNumber}) in ${waitInMs / 1000} seconds`, status);
    await sleepAsync(waitInMs);
    retryNumber++;
  }
}
|
||||
|
||||
/**
 * Decides whether a request outcome is worth another attempt: any error,
 * any transient (5xx) status code, or HTTP 429.
 */
function shouldRetry(status: IUrlStatus) {
  const tooManyRequests = 429;
  return !!status.error
    || isTransientError(status.statusCode)
    || status.statusCode === tooManyRequests;
}
|
||||
|
||||
/** Tells whether the status code lies in the 500–599 range. */
function isTransientError(statusCode: number) {
  const lowestTransientCode = 500;
  const highestTransientCode = 599;
  return lowestTransientCode <= statusCode && statusCode <= highestTransientCode;
}
|
||||
|
||||
/**
 * Computes how long to wait before the given retry.
 *
 * The wait doubles with every retry (base, 2x base, 4x base, ...) and is
 * multiplied by a random factor so that concurrent retries spread out.
 *
 * @param currentRetry 1-based retry counter.
 * @param baseRetryIntervalInMs Wait of the first retry before randomization.
 * @returns The wait in whole milliseconds, rounded up.
 */
function getRetryTimeoutInMs(currentRetry: number, baseRetryIntervalInMs: number = DefaultBaseRetryIntervalInMs) {
  const retryRandomFactor = 0.5; // Retry intervals are between 50% and 150%
                                 // of the exponentially increasing base amount
  const minRandom = 1 - retryRandomFactor;
  const maxRandom = 1 + retryRandomFactor;
  // Offset by minRandom so the factor spans [minRandom, maxRandom); offsetting
  // by maxRandom would shift the range to 150%–250%.
  const randomization = (Math.random() * (maxRandom - minRandom)) + minRandom;
  const exponential = Math.pow(2, currentRetry - 1);
  return Math.ceil(exponential * baseRetryIntervalInMs * randomization);
}
|
||||
@@ -0,0 +1,5 @@
|
||||
/** Outcome of checking a single URL. */
export interface IUrlStatus {
  /** The URL that was checked. */
  url: string;
  /** Whatever was thrown while requesting the URL, if the request failed. */
  error?: any;
  /** HTTP status code of the response, if one was received. */
  statusCode?: number;
}
|
||||
@@ -0,0 +1,47 @@
|
||||
import { retryWithExponentialBackOffAsync } from './ExponentialBackOffRetryHandler';
|
||||
import { IUrlStatus } from './IUrlStatus';
|
||||
import fetch from 'cross-fetch';
|
||||
|
||||
/** Options for requesting a single URL. */
export interface IRequestOptions {
  /** Base interval, in milliseconds, passed to the retry handler. */
  retryExponentialBaseInMs?: number;
  /** Headers merged over the default request headers; same-named keys win. */
  additionalHeaders?: Record<string, string>;
}
|
||||
|
||||
/**
 * Sends a GET request to the URL, retrying through the exponential back-off
 * handler, and reports the outcome instead of throwing.
 *
 * @param url URL to request.
 * @param options Request options; missing fields fall back to the defaults.
 * @returns The URL with either the response status code or the caught error.
 */
export async function getUrlStatusAsync(
  url: string,
  options: IRequestOptions = DefaultOptions): Promise<IUrlStatus> {
  const effectiveOptions: IRequestOptions = { ...DefaultOptions, ...options };
  const fetchOptions = getFetchOptions(effectiveOptions);
  // A single attempt; failures are captured in the result so the retry
  // handler can inspect them.
  const requestOnceAsync = async (): Promise<IUrlStatus> => {
    console.log('Requesting', url); // tslint:disable-line: no-console
    try {
      const { status } = await fetch(url, fetchOptions);
      return { url, statusCode: status };
    } catch (failure) {
      return { url, error: failure };
    }
  };
  return retryWithExponentialBackOffAsync(requestOnceAsync, effectiveOptions.retryExponentialBaseInMs);
}
|
||||
|
||||
/** Request options applied to any field the caller leaves out. */
const DefaultOptions: IRequestOptions = {
  retryExponentialBaseInMs: 5 * 1000, // five seconds
  additionalHeaders: {},
};
|
||||
|
||||
/**
 * Builds the options object passed to `fetch`: a GET request whose headers
 * are the defaults overlaid with the caller's additional headers.
 */
function getFetchOptions(options: IRequestOptions) {
  const headers = { ...DefaultHeaders, ...options.additionalHeaders };
  return { method: 'GET', headers };
}
|
||||
|
||||
/**
 * Headers sent with every request unless overridden by additional headers.
 * The user agent is that of Chrome on macOS.
 */
const DefaultHeaders: Record<string, string> = {
  'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
  'upgrade-insecure-requests': '1',
  connection: 'keep-alive',
  accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
  'accept-encoding': 'gzip, deflate, br',
  'cache-control': 'max-age=0',
  'accept-language': 'en-US,en;q=0.9',
};
|
||||
@@ -0,0 +1,19 @@
|
||||
/**
 * Splits the URLs into groups that share the same domain.
 *
 * @param urls URLs to group.
 * @returns One array per domain, ordered by the first appearance of the
 *          domain; URLs inside a group keep their input order.
 */
export function groupUrlsByDomain(urls: string[]): string[][] {
  // A Map iterates in insertion order, which preserves first-seen domain order.
  const urlsByDomain = new Map<string, string[]>();
  for (const url of urls) {
    const domain = extractDomain(url);
    const group = urlsByDomain.get(domain);
    if (group) {
      group.push(url);
    } else {
      urlsByDomain.set(domain, [url]);
    }
  }
  return Array.from(urlsByDomain.values());
}
|
||||
|
||||
/**
 * Extracts the lower-cased host (including a non-default port) of a URL.
 *
 * Uses the standard URL parser so that a query string, fragment or user info
 * never ends up in the result, even when the URL has no path.
 *
 * @param url Absolute URL, including its scheme.
 * @returns The host part of the URL in lower case.
 * @throws Error when the URL cannot be parsed or has no host.
 */
function extractDomain(url: string): string {
  let host: string;
  try {
    host = new URL(url).host;
  } catch (err) {
    throw new Error(`Cannot extract domain, invalid URL: "${url}"`);
  }
  if (!host) {
    throw new Error(`Cannot extract domain, URL has no host: "${url}"`);
  }
  return host.toLowerCase();
}
|
||||
Reference in New Issue
Block a user