fix broken URLs and automate broken URL checks #70

This commit:
- Fixes broken URLs using archive.org or other references.
- Replaces tenforums.com URLs with better documentation, as they tend to return HTTP status code 403 to automated tests and are also a low-quality source.
- Changes all insecure http sources to https alternatives.
- Adds integration tests to check for broken URLs (see the usage sketch below).
  - Logic is implemented for delaying consecutive requests to the same domain; it is left disabled, however, as the sources can handle fully parallelized requests.
- Runs the test pipeline weekly to get notified about broken URLs without new commits.
undergroundwires
2021-05-05 23:52:06 +02:00
parent 36f0805590
commit db62ed7f3a
14 changed files with 307 additions and 27 deletions
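
A minimal sketch of how the new checker can be driven, assuming the module path used by the test file in this diff; the URLs are hypothetical and for illustration only:

import { getUrlStatusesInParallelAsync, IBatchRequestOptions } from './StatusChecker/BatchStatusChecker';

// Hypothetical URLs for illustration only.
const urls = [
  'https://example.com/docs',
  'https://example.org/guide',
];
const options: IBatchRequestOptions = {
  domainOptions: {
    sameDomainParallelize: false, // throttle requests hitting the same domain
    sameDomainDelayInMs: 3 /* sec */ * 1000,
  },
  requestOptions: {
    retryExponentialBaseInMs: 5 /* sec */ * 1000,
  },
};
getUrlStatusesInParallelAsync(urls, options)
  .then((statuses) => statuses
    .filter((status) => status.statusCode !== 200)
    .forEach((status) => console.log('Dead URL:', status.url)));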


@@ -0,0 +1,48 @@
import 'mocha';
import { expect } from 'chai';
import { parseApplication } from '@/application/Parser/ApplicationParser';
import { IApplication } from '@/domain/IApplication';
import { IUrlStatus } from './StatusChecker/IUrlStatus';
import { getUrlStatusesInParallelAsync, IBatchRequestOptions } from './StatusChecker/BatchStatusChecker';
describe('collections', () => {
  // arrange
  const app = parseApplication();
  const urls = collectUniqueUrls(app);
  const options: IBatchRequestOptions = {
    domainOptions: {
      sameDomainParallelize: true, // no need to be so nice until sources start failing
      // sameDomainDelayInMs: 2 /* sec */ * 1000,
    },
    requestOptions: {
      retryExponentialBaseInMs: 3 /* sec */ * 1000,
      additionalHeaders: { referer: app.info.homepage },
    },
  };
  const testTimeoutInMs = urls.length * 60000 /* 1 minute */;
  it('have no dead urls', async () => {
    // act
    const results = await getUrlStatusesInParallelAsync(urls, options);
    // assert
    const deadUrls = results.filter((r) => r.statusCode !== 200);
    expect(deadUrls).to.have.lengthOf(0, printUrls(deadUrls));
  }).timeout(testTimeoutInMs);
});

function collectUniqueUrls(app: IApplication): string[] {
  return app
    .collections
    .flatMap((collection) => collection.getAllScripts())
    .flatMap((script) => script.documentationUrls)
    .filter((url, index, array) => array.indexOf(url) === index); // keep first occurrence only
}

function printUrls(statuses: IUrlStatus[]): string {
  return '\n' +
    statuses.map((status) =>
      `- ${status.url}\n` +
      (status.statusCode ? `\tResponse code: ${status.statusCode}` : '') +
      (status.error ? `\tException: ${JSON.stringify(status.error, null, '\t')}` : ''))
      .join('\n')
    + '\n';
}


@@ -0,0 +1,67 @@
import { sleepAsync } from '@/infrastructure/Threading/AsyncSleep';
import { IUrlStatus } from './IUrlStatus';
import { getUrlStatusAsync, IRequestOptions } from './Requestor';
import { groupUrlsByDomain } from './UrlPerDomainGrouper';
export async function getUrlStatusesInParallelAsync(
  urls: string[],
  options?: IBatchRequestOptions): Promise<IUrlStatus[]> {
  const uniqueUrls = Array.from(new Set(urls));
  // Merge nested options so a partial override keeps the remaining defaults.
  options = {
    domainOptions: { ...DefaultOptions.domainOptions, ...options?.domainOptions },
    requestOptions: { ...DefaultOptions.requestOptions, ...options?.requestOptions },
  };
  console.log('Options: ', options); // tslint:disable-line: no-console
  const results = await requestAsync(uniqueUrls, options);
  return results;
}

export interface IBatchRequestOptions {
  domainOptions?: IDomainOptions;
  requestOptions?: IRequestOptions;
}

interface IDomainOptions {
  sameDomainParallelize?: boolean;
  sameDomainDelayInMs?: number;
}

const DefaultOptions: IBatchRequestOptions = {
  domainOptions: {
    sameDomainParallelize: false,
    sameDomainDelayInMs: 3 /* sec */ * 1000,
  },
  requestOptions: {
    retryExponentialBaseInMs: 5 /* sec */ * 1000,
    additionalHeaders: {},
  },
};

function requestAsync(urls: string[], options: IBatchRequestOptions): Promise<IUrlStatus[]> {
  if (!options.domainOptions.sameDomainParallelize) {
    // Throttle: URLs sharing a domain run sequentially with a delay in between.
    return runOnEachDomainWithDelayAsync(
      urls,
      (url) => getUrlStatusAsync(url, options.requestOptions),
      options.domainOptions.sameDomainDelayInMs);
  }
  return Promise.all(
    urls.map((url) => getUrlStatusAsync(url, options.requestOptions)));
}

async function runOnEachDomainWithDelayAsync(
  urls: string[],
  action: (url: string) => Promise<IUrlStatus>,
  delayInMs: number): Promise<IUrlStatus[]> {
  const grouped = groupUrlsByDomain(urls);
  const tasks = grouped.map(async (group) => {
    const results = new Array<IUrlStatus>();
    for (const url of group) {
      const status = await action(url);
      results.push(status);
      if (results.length !== group.length) { // skip the delay after the last URL
        await sleepAsync(delayInMs);
      }
    }
    return results;
  });
  const r = await Promise.all(tasks); // different domains still run in parallel
  return r.flat();
}
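
To make the throttling above concrete, a sketch with hypothetical URLs of how requests are scheduled when same-domain parallelization is off:

// With sameDomainParallelize: false and sameDomainDelayInMs: 3000:
const urls = [
  'https://example.com/a', // domain group 1: requested immediately
  'https://example.com/b', // domain group 1: requested 3s after /a resolves
  'https://example.org/c', // domain group 2: requested immediately, in parallel
];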


@@ -0,0 +1,44 @@
import { sleepAsync } from '@/infrastructure/Threading/AsyncSleep';
import { IUrlStatus } from './IUrlStatus';
const DefaultBaseRetryIntervalInMs = 5 /* sec */ * 1000;
export async function retryWithExponentialBackOffAsync(
  action: () => Promise<IUrlStatus>,
  baseRetryIntervalInMs: number = DefaultBaseRetryIntervalInMs,
  currentRetry = 1): Promise<IUrlStatus> {
  const maxTries = 3; // up to 3 retries after the initial attempt
  const status = await action();
  if (shouldRetry(status) && currentRetry <= maxTries) {
    const exponentialBackOffInMs = getRetryTimeoutInMs(currentRetry, baseRetryIntervalInMs);
    // tslint:disable-next-line: no-console
    console.log(`Retrying (${currentRetry}) in ${exponentialBackOffInMs / 1000} seconds`, status);
    await sleepAsync(exponentialBackOffInMs);
    return retryWithExponentialBackOffAsync(action, baseRetryIntervalInMs, currentRetry + 1);
  }
  return status;
}

function shouldRetry(status: IUrlStatus) {
  if (status.error) {
    return true;
  }
  return isTransientError(status.statusCode)
    || status.statusCode === 429; // Too Many Requests
}

function isTransientError(statusCode: number) {
  return statusCode >= 500 && statusCode <= 599;
}

function getRetryTimeoutInMs(currentRetry: number, baseRetryIntervalInMs: number = DefaultBaseRetryIntervalInMs) {
  const retryRandomFactor = 0.5; // retry intervals span 50%–150% of the exponentially increasing base
  const minRandom = 1 - retryRandomFactor;
  const maxRandom = 1 + retryRandomFactor;
  const randomization = (Math.random() * (maxRandom - minRandom)) + minRandom;
  const exponential = Math.pow(2, currentRetry - 1);
  return Math.ceil(exponential * baseRetryIntervalInMs * randomization);
}
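
With the default 5-second base, the jittered timeouts above fall in the following ranges (a sketch; actual values are randomized per retry):

for (let retry = 1; retry <= 3; retry++) {
  const baseInMs = Math.pow(2, retry - 1) * 5000;
  console.log(`retry ${retry}: ${baseInMs * 0.5}ms – ${baseInMs * 1.5}ms`);
}
// retry 1: 2500ms – 7500ms
// retry 2: 5000ms – 15000ms
// retry 3: 10000ms – 30000ms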


@@ -0,0 +1,5 @@
export interface IUrlStatus {
  url: string;
  error?: any;
  statusCode?: number;
}


@@ -0,0 +1,47 @@
import { retryWithExponentialBackOffAsync } from './ExponentialBackOffRetryHandler';
import { IUrlStatus } from './IUrlStatus';
import fetch from 'cross-fetch';
export interface IRequestOptions {
  retryExponentialBaseInMs?: number;
  additionalHeaders?: Record<string, string>;
}

export async function getUrlStatusAsync(
  url: string,
  options: IRequestOptions = DefaultOptions): Promise<IUrlStatus> {
  options = { ...DefaultOptions, ...options };
  const fetchOptions = getFetchOptions(options);
  return retryWithExponentialBackOffAsync(async () => {
    console.log('Requesting', url); // tslint:disable-line: no-console
    try {
      const response = await fetch(url, fetchOptions);
      return { url, statusCode: response.status };
    } catch (err) {
      // Network-level failures resolve with an error instead of rejecting.
      return { url, error: err };
    }
  }, options.retryExponentialBaseInMs);
}

const DefaultOptions: IRequestOptions = {
  retryExponentialBaseInMs: 5000,
  additionalHeaders: {},
};

function getFetchOptions(options: IRequestOptions) {
  return {
    method: 'GET',
    headers: { ...DefaultHeaders, ...options.additionalHeaders },
  };
}

const DefaultHeaders: Record<string, string> = {
  /* Chrome on macOS */
  'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
  'upgrade-insecure-requests': '1',
  'connection': 'keep-alive',
  'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
  'accept-encoding': 'gzip, deflate, br',
  'cache-control': 'max-age=0',
  'accept-language': 'en-US,en;q=0.9',
};
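
A minimal usage sketch of the requestor, with a hypothetical URL; note that the returned promise resolves even when the request fails at the network level, reporting the failure through the status object:

// Inside an async function:
const status = await getUrlStatusAsync('https://example.com/'); // hypothetical URL
if (status.error) {
  console.log('Request failed:', status.error);
} else {
  console.log('HTTP status:', status.statusCode);
}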


@@ -0,0 +1,19 @@
export function groupUrlsByDomain(urls: string[]): string[][] {
  const domains = new Set<string>();
  const urlsWithDomain = urls.map((url) => ({
    url,
    domain: extractDomain(url),
  }));
  for (const url of urlsWithDomain) {
    domains.add(url.domain);
  }
  return Array.from(domains).map((domain) => {
    return urlsWithDomain
      .filter((url) => url.domain === domain)
      .map((url) => url.url);
  });
}

function extractDomain(url: string): string {
  // Assumes absolute URLs with a scheme, e.g. "https://host/path".
  return url.split('://')[1].split('/')[0].toLowerCase();
}
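
For illustration, grouping hypothetical URLs yields one array per domain, preserving first-seen domain order:

const groups = groupUrlsByDomain([
  'https://example.com/a',
  'https://example.org/c',
  'https://example.com/b',
]);
// groups => [
//   ['https://example.com/a', 'https://example.com/b'],
//   ['https://example.org/c'],
// ]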