Add ruleset packs, linter, fixtures, and JSON schema

This commit is contained in:
2025-12-31 22:21:43 -05:00
parent dddac108fe
commit 21bb7cae5a
16 changed files with 475 additions and 174 deletions

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
/.local/
/target/
**/*.log
/tests/fixtures/generated/

View File

@@ -0,0 +1,29 @@
# Audio codec rules (AAC, MP3, AC-3).
# Each pattern is a case-insensitive regex ("(?i)") matched against scanner
# log lines; these all indicate bitstream damage that a remux cannot repair,
# hence fix_tier = "reencode" throughout.
[[rule]]
id = "AAC_ADTS_HEADER_ERROR"
domain = "codec.aac"
severity = "medium"
confidence = 0.7
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)Error parsing ADTS frame header", "(?i)Error decoding AAC frame header"]
notes = "AAC bitstream errors detected."
[[rule]]
id = "MP3_HEADER_MISSING"
domain = "codec.mp3"
severity = "medium"
confidence = 0.6
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)Header missing"]
notes = "MP3 framing errors detected."
[[rule]]
id = "AC3_FRAME_SYNC_ERROR"
domain = "codec.ac3"
severity = "medium"
confidence = 0.6
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)frame sync error"]
notes = "AC-3 frame sync error detected."

View File

@@ -0,0 +1,39 @@
# Video codec rules (H.264/HEVC NAL-level errors).
# stop_scan = true marks corruption severe enough to abort further scanning;
# the linter (rules/lint.rs) requires such rules to use fix_tier = "reencode".
[[rule]]
id = "INVALID_NAL_UNIT_SIZE"
domain = "codec.h264"
severity = "severe"
confidence = 0.85
fix_tier = "reencode"
stop_scan = true
patterns = ["(?i)Invalid NAL unit size", "(?i)Error splitting the input into NAL units"]
notes = "H.264/HEVC bitstream corruption detected."
[[rule]]
id = "MISSING_PICTURE_ACCESS_UNIT"
domain = "codec.h264"
severity = "high"
confidence = 0.7
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)missing picture in access unit"]
notes = "Missing picture in access unit; possible corruption."
[[rule]]
id = "PPS_ID_OUT_OF_RANGE"
domain = "codec.hevc"
severity = "high"
confidence = 0.75
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)PPS id out of range", "(?i)Error parsing NAL unit"]
notes = "HEVC parameter set corruption detected."
# Deliberately low severity/confidence: truncated SEI alone is usually benign.
[[rule]]
id = "SEI_TRUNCATED"
domain = "codec.h264"
severity = "low"
confidence = 0.5
fix_tier = "none"
stop_scan = false
patterns = ["(?i)SEI type .* truncated"]
notes = "SEI message truncated; often benign unless paired with decode errors."

30
rulesets/containers.toml Normal file
View File

@@ -0,0 +1,30 @@
# Container-level rules for MP4/MOV and Matroska/WebM.
[[rule]]
id = "MOOV_ATOM_NOT_FOUND"
domain = "container.mp4"
severity = "severe"
confidence = 0.9
fix_tier = "reencode"
stop_scan = true
patterns = ["(?i)moov atom not found"]
notes = "MP4/MOV metadata is missing; file may be incomplete."
[[rule]]
id = "EBML_HEADER_PARSING_FAILED"
domain = "container.mkv"
severity = "high"
confidence = 0.7
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)EBML header parsing failed"]
notes = "Matroska/WebM header parsing failed; file may be truncated."
# Advisory rule: action = "faststart" with fix_tier = "remux" requests moving
# the moov atom to the front of the file rather than a re-encode.
[[rule]]
id = "FASTSTART_RECOMMENDED"
domain = "container.mp4"
severity = "low"
confidence = 0.4
fix_tier = "remux"
stop_scan = false
action = "faststart"
patterns = ["(?i)faststart"]
notes = "MP4 likely has moov atom at end; faststart remux recommended."

View File

@@ -1,170 +0,0 @@
[[rule]]
id = "MOOV_ATOM_NOT_FOUND"
domain = "container.mp4"
severity = "severe"
confidence = 0.9
fix_tier = "reencode"
stop_scan = true
patterns = ["(?i)moov atom not found"]
notes = "MP4/MOV metadata is missing; file may be incomplete."
[[rule]]
id = "COULD_NOT_FIND_CODEC_PARAMS"
domain = "probe"
severity = "medium"
confidence = 0.6
fix_tier = "none"
stop_scan = false
patterns = ["(?i)could not find codec parameters"]
notes = "Insufficient probe data; consider higher analyzeduration/probesize."
[[rule]]
id = "INVALID_DATA_FOUND"
domain = "decode"
severity = "high"
confidence = 0.7
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)Invalid data found when processing input"]
notes = "Decoder encountered invalid data; may indicate corruption."
[[rule]]
id = "FILE_ENDED_PREMATURELY"
domain = "decode"
severity = "high"
confidence = 0.7
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)File ended prematurely"]
notes = "File appears truncated."
[[rule]]
id = "INVALID_NAL_UNIT_SIZE"
domain = "codec.h264"
severity = "severe"
confidence = 0.85
fix_tier = "reencode"
stop_scan = true
patterns = ["(?i)Invalid NAL unit size", "(?i)Error splitting the input into NAL units"]
notes = "H.264/HEVC bitstream corruption detected."
[[rule]]
id = "MISSING_PICTURE_ACCESS_UNIT"
domain = "codec.h264"
severity = "high"
confidence = 0.7
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)missing picture in access unit"]
notes = "Missing picture in access unit; possible corruption."
[[rule]]
id = "PPS_ID_OUT_OF_RANGE"
domain = "codec.hevc"
severity = "high"
confidence = 0.75
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)PPS id out of range", "(?i)Error parsing NAL unit"]
notes = "HEVC parameter set corruption detected."
[[rule]]
id = "SEI_TRUNCATED"
domain = "codec.h264"
severity = "low"
confidence = 0.5
fix_tier = "none"
stop_scan = false
patterns = ["(?i)SEI type .* truncated"]
notes = "SEI message truncated; often benign unless paired with decode errors."
[[rule]]
id = "NON_MONOTONOUS_DTS"
domain = "timestamp"
severity = "medium"
confidence = 0.6
fix_tier = "remux"
stop_scan = false
patterns = ["(?i)Non-monotonous DTS", "(?i)non monotonically increasing dts"]
notes = "Timestamp discontinuity detected."
[[rule]]
id = "DTS_DISCONTINUITY"
domain = "timestamp"
severity = "medium"
confidence = 0.6
fix_tier = "remux"
stop_scan = false
patterns = ["(?i)DTS discontinuity"]
notes = "Timestamp discontinuity detected."
[[rule]]
id = "PES_PACKET_SIZE_MISMATCH"
domain = "transport.ts"
severity = "medium"
confidence = 0.6
fix_tier = "remux"
stop_scan = false
patterns = ["(?i)PES packet size mismatch", "(?i)Packet corrupt"]
notes = "Transport stream corruption detected."
[[rule]]
id = "CONTINUITY_COUNTER_ERROR"
domain = "transport.ts"
severity = "low"
confidence = 0.4
fix_tier = "none"
stop_scan = false
patterns = ["(?i)continuity counter error"]
notes = "Continuity counter errors can be benign in segmented streams."
[[rule]]
id = "AAC_ADTS_HEADER_ERROR"
domain = "codec.aac"
severity = "medium"
confidence = 0.7
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)Error parsing ADTS frame header", "(?i)Error decoding AAC frame header"]
notes = "AAC bitstream errors detected."
[[rule]]
id = "MP3_HEADER_MISSING"
domain = "codec.mp3"
severity = "medium"
confidence = 0.6
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)Header missing"]
notes = "MP3 framing errors detected."
[[rule]]
id = "AC3_FRAME_SYNC_ERROR"
domain = "codec.ac3"
severity = "medium"
confidence = 0.6
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)frame sync error"]
notes = "AC-3 frame sync error detected."
[[rule]]
id = "EBML_HEADER_PARSING_FAILED"
domain = "container.mkv"
severity = "high"
confidence = 0.7
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)EBML header parsing failed"]
notes = "Matroska/WebM header parsing failed; file may be truncated."
[[rule]]
id = "FASTSTART_RECOMMENDED"
domain = "container.mp4"
severity = "low"
confidence = 0.4
fix_tier = "remux"
stop_scan = false
action = "faststart"
patterns = ["(?i)faststart"]
notes = "MP4 likely has moov atom at end; faststart remux recommended."

19
rulesets/decode.toml Normal file
View File

@@ -0,0 +1,19 @@
# Generic decode-stage rules: container-agnostic invalid data and truncation.
[[rule]]
id = "INVALID_DATA_FOUND"
domain = "decode"
severity = "high"
confidence = 0.7
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)Invalid data found when processing input"]
notes = "Decoder encountered invalid data; may indicate corruption."
[[rule]]
id = "FILE_ENDED_PREMATURELY"
domain = "decode"
severity = "high"
confidence = 0.7
fix_tier = "reencode"
stop_scan = false
patterns = ["(?i)File ended prematurely"]
notes = "File appears truncated."

9
rulesets/probe.toml Normal file
View File

@@ -0,0 +1,9 @@
# Probe-stage rules: failures detected before any decoding starts.
# fix_tier = "none" because this is an analysis limitation, not file damage.
[[rule]]
id = "COULD_NOT_FIND_CODEC_PARAMS"
domain = "probe"
severity = "medium"
confidence = 0.6
fix_tier = "none"
stop_scan = false
patterns = ["(?i)could not find codec parameters"]
notes = "Insufficient probe data; consider higher analyzeduration/probesize."

19
rulesets/timestamps.toml Normal file
View File

@@ -0,0 +1,19 @@
# Timestamp rules: DTS discontinuities, typically repairable by remuxing.
[[rule]]
id = "NON_MONOTONOUS_DTS"
domain = "timestamp"
severity = "medium"
confidence = 0.6
fix_tier = "remux"
stop_scan = false
patterns = ["(?i)Non-monotonous DTS", "(?i)non monotonically increasing dts"]
notes = "Timestamp discontinuity detected."
[[rule]]
id = "DTS_DISCONTINUITY"
domain = "timestamp"
severity = "medium"
confidence = 0.6
fix_tier = "remux"
stop_scan = false
patterns = ["(?i)DTS discontinuity"]
notes = "Timestamp discontinuity detected."

19
rulesets/transport.toml Normal file
View File

@@ -0,0 +1,19 @@
# MPEG-TS transport rules: packet-level corruption and continuity errors.
[[rule]]
id = "PES_PACKET_SIZE_MISMATCH"
domain = "transport.ts"
severity = "medium"
confidence = 0.6
fix_tier = "remux"
stop_scan = false
patterns = ["(?i)PES packet size mismatch", "(?i)Packet corrupt"]
notes = "Transport stream corruption detected."
# Low severity on purpose: counter gaps are expected in segmented streams.
[[rule]]
id = "CONTINUITY_COUNTER_ERROR"
domain = "transport.ts"
severity = "low"
confidence = 0.4
fix_tier = "none"
stop_scan = false
patterns = ["(?i)continuity counter error"]
notes = "Continuity counter errors can be benign in segmented streams."

22
scripts/generate_fixtures.sh Executable file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bash
# Generates test fixture media under tests/fixtures/generated/:
#   clean.mp4        -- valid 1s file with faststart
#   no_faststart.mp4 -- valid 1s file with the moov atom left at the end
#   truncated.mp4    -- clean.mp4 with the last 2048 bytes cut off
set -euo pipefail
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
OUT_DIR="$ROOT_DIR/tests/fixtures/generated"
mkdir -p "$OUT_DIR"
ffmpeg -y -hide_banner -loglevel error -f lavfi -i testsrc=size=128x72:rate=30 -f lavfi -i sine=frequency=1000:sample_rate=44100 -t 1 -c:v libx264 -pix_fmt yuv420p -c:a aac -movflags +faststart "$OUT_DIR/clean.mp4"
ffmpeg -y -hide_banner -loglevel error -f lavfi -i testsrc=size=128x72:rate=30 -f lavfi -i sine=frequency=1000:sample_rate=44100 -t 1 -c:v libx264 -pix_fmt yuv420p -c:a aac "$OUT_DIR/no_faststart.mp4"
# Create a truncated file to simulate partial download/corruption
cp "$OUT_DIR/clean.mp4" "$OUT_DIR/truncated.mp4"
# FIX: the heredoc delimiter is quoted ('PY'), which disables shell expansion,
# so the previous "${OUT_DIR}" inside the Python body was passed through as a
# literal string and the wrong path was opened (aborting under `set -e`).
# Pass the directory via the environment instead.
OUT_DIR="$OUT_DIR" python3 - <<'PY'
import os
path = os.path.join(os.environ["OUT_DIR"], "truncated.mp4")
size = os.path.getsize(path)
with open(path, "r+b") as f:
    f.truncate(max(0, size - 2048))
PY
echo "Fixtures written to $OUT_DIR"

View File

@@ -4,4 +4,4 @@ mod types;
pub use json::render_json;
pub use text::{render_fix_line, render_scan_line, render_summary};
pub use types::{Report, ScanReport};
pub use types::{FixJsonReport, Report, ScanJsonReport, ScanReport, SCHEMA_VERSION};

View File

@@ -3,6 +3,8 @@ use serde::{Deserialize, Serialize};
use crate::fix::FixOutcome;
use crate::scan::ScanOutcome;
pub const SCHEMA_VERSION: &str = "1.0";
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanReport {
pub scan: ScanOutcome,
@@ -13,3 +15,16 @@ pub struct Report {
pub scan: ScanOutcome,
pub fix: Option<FixOutcome>,
}
/// JSON envelope for scan output: a schema version tag plus one
/// [`ScanOutcome`] per scanned file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanJsonReport {
    // Set to SCHEMA_VERSION by the CLI when the payload is assembled.
    pub schema_version: String,
    pub scans: Vec<ScanOutcome>,
}
/// JSON envelope for fix output: the scan results plus the fix outcomes
/// produced for them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FixJsonReport {
    pub schema_version: String,
    pub scans: Vec<ScanOutcome>,
    pub fixes: Vec<FixOutcome>,
}

View File

@@ -0,0 +1,113 @@
use std::collections::{HashMap, HashSet};
use regex::Regex;
use super::model::{FixTier, Rule};
/// Accumulated lint findings for a rule set: hard errors (rules that must
/// not ship) and advisory warnings (suspicious but usable rules).
#[derive(Debug, Default, Clone)]
pub struct LintReport {
    pub errors: Vec<String>,
    pub warnings: Vec<String>,
}
impl LintReport {
    /// True when at least one hard error has been recorded.
    pub fn has_errors(&self) -> bool {
        self.errors.first().is_some()
    }
}
pub fn lint_rules(rules: &[Rule]) -> LintReport {
let mut report = LintReport::default();
let mut ids = HashSet::new();
let mut pattern_map: HashMap<String, String> = HashMap::new();
for rule in rules {
if rule.id.trim().is_empty() {
report.errors.push("Rule id is empty".to_string());
}
if !ids.insert(rule.id.clone()) {
report
.errors
.push(format!("Duplicate rule id: {}", rule.id));
}
if rule.domain.trim().is_empty() {
report
.errors
.push(format!("Rule {} has empty domain", rule.id));
}
if rule.patterns.is_empty() {
report
.errors
.push(format!("Rule {} has no patterns", rule.id));
}
if !(0.0..=1.0).contains(&rule.confidence) {
report.errors.push(format!(
"Rule {} has invalid confidence {}",
rule.id, rule.confidence
));
}
if rule.stop_scan && rule.fix_tier != FixTier::Reencode {
report.errors.push(format!(
"Rule {} has stop_scan=true but fix_tier is {:?}",
rule.id, rule.fix_tier
));
}
if let Some(action) = &rule.action {
if action.eq_ignore_ascii_case("faststart") && rule.fix_tier == FixTier::Reencode {
report.warnings.push(format!(
"Rule {} uses faststart action but fix_tier is reencode",
rule.id
));
}
}
for pattern in &rule.patterns {
if let Err(err) = Regex::new(pattern) {
report.errors.push(format!(
"Rule {} has invalid regex '{}': {}",
rule.id, pattern, err
));
}
if let Some(existing) = pattern_map.get(pattern) {
if existing != &rule.id {
report.warnings.push(format!(
"Pattern '{}' appears in rules {} and {}",
pattern, existing, rule.id
));
}
} else {
pattern_map.insert(pattern.clone(), rule.id.clone());
}
}
}
report
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::rules::model::{FixTier, Rule, Severity};
    // Builds a minimal, otherwise-valid rule with the given id.
    fn sample_rule(id: &str) -> Rule {
        Rule {
            id: id.to_string(),
            domain: "test".to_string(),
            severity: Severity::Low,
            confidence: 0.5,
            fix_tier: FixTier::None,
            stop_scan: false,
            patterns: vec!["foo".to_string()],
            notes: None,
            action: None,
            requires: vec![],
            excludes: vec![],
        }
    }
    #[test]
    fn detects_duplicate_ids() {
        let rules = [sample_rule("DUP"), sample_rule("DUP")];
        let report = lint_rules(&rules);
        assert!(report.has_errors());
    }
}

View File

@@ -5,9 +5,13 @@ use anyhow::Result;
use crate::scan::ProbeData;
mod loader;
mod lint;
mod matcher;
mod model;
use model::Rule;
pub use lint::{lint_rules, LintReport};
pub use matcher::{RuleContext, RuleMatch};
pub use model::{FixTier, Severity};
@@ -41,6 +45,12 @@ impl RuleSet {
Ok(Self { rules: Vec::new() })
}
/// Builds a RuleSet by loading every rule file from `dir` and compiling
/// the raw rules into their matchable form.
pub fn load_from_dir(dir: &std::path::Path) -> Result<Self> {
    let compiled = loader::compile_rules(loader::load_rules_from_dir(dir)?)?;
    Ok(Self { rules: compiled })
}
pub fn match_lines(&self, lines: &[String], context: &RuleContext) -> Vec<RuleMatch> {
let mut matches = Vec::new();
for rule in &self.rules {
@@ -61,6 +71,29 @@ impl RuleSet {
}
}
/// Loads uncompiled rules for linting, searching candidate directories in
/// priority order: `./rulesets` under the current working directory, then
/// `rulesets` next to the executable. Returns the first non-empty set found,
/// or an empty Vec when no candidate yields rules.
pub fn load_raw_rules() -> Result<Vec<Rule>> {
    let mut candidates = Vec::new();
    if let Ok(current) = std::env::current_dir() {
        candidates.push(current.join("rulesets"));
    }
    if let Ok(exe) = std::env::current_exe() {
        if let Some(parent) = exe.parent() {
            candidates.push(parent.join("rulesets"));
        }
    }
    for dir in candidates {
        // Skip candidates that do not exist so a missing ./rulesets cannot
        // abort the search (via `?`) before the exe-relative directory is
        // tried. Real load/parse errors in an existing dir still propagate.
        if !dir.is_dir() {
            continue;
        }
        let rules = loader::load_rules_from_dir(&dir)?;
        if !rules.is_empty() {
            return Ok(rules);
        }
    }
    Ok(Vec::new())
}
pub fn build_context(probe: &ProbeData) -> RuleContext {
let mut context = RuleContext::default();

View File

@@ -0,0 +1,76 @@
use std::path::PathBuf;
use std::process::Command;
use tempfile::tempdir;
use vid_repair_core::config::Config;
use vid_repair_core::rules::RuleSet;
use vid_repair_core::scan::scan_file;
/// Returns true when `cmd` can be spawned with `-version` and exits
/// successfully; used to skip fixture tests when ffmpeg/ffprobe is absent.
fn command_available(cmd: &str) -> bool {
    match Command::new(cmd).arg("-version").output() {
        Ok(out) => out.status.success(),
        Err(_) => false,
    }
}
/// Resolves the workspace-level `rulesets/` directory relative to this
/// crate's manifest (one level above CARGO_MANIFEST_DIR).
fn ruleset_dir() -> PathBuf {
    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .map(|workspace_root| workspace_root.join("rulesets"))
        .expect("workspace root")
}
#[test]
fn scan_clean_fixture_has_no_issues() {
    // Skip (rather than fail) on machines without the ffmpeg toolchain.
    if !command_available("ffmpeg") || !command_available("ffprobe") {
        eprintln!("ffmpeg/ffprobe not available; skipping fixture test");
        return;
    }
    let dir = tempdir().expect("tempdir");
    let output = dir.path().join("clean.mp4");
    // One second of synthetic video+audio with faststart: a known-good file.
    let ffmpeg_args = [
        "-y",
        "-hide_banner",
        "-loglevel",
        "error",
        "-f",
        "lavfi",
        "-i",
        "testsrc=size=128x72:rate=30",
        "-f",
        "lavfi",
        "-i",
        "sine=frequency=1000:sample_rate=44100",
        "-t",
        "1",
        "-c:v",
        "libx264",
        "-pix_fmt",
        "yuv420p",
        "-c:a",
        "aac",
        "-movflags",
        "+faststart",
        output.to_str().unwrap(),
    ];
    let status = Command::new("ffmpeg")
        .args(ffmpeg_args)
        .status()
        .expect("ffmpeg run");
    // Also a soft skip: fixture creation failures are environment problems.
    if !status.success() {
        eprintln!("ffmpeg failed to create fixture");
        return;
    }
    let config = Config::default();
    let ruleset = RuleSet::load_from_dir(&ruleset_dir()).expect("ruleset load");
    let scan = scan_file(&output, &config, &ruleset).expect("scan file");
    assert!(
        scan.issues.is_empty(),
        "Expected no issues, got {}",
        scan.issues.len()
    );
}

View File

@@ -7,7 +7,10 @@ use rayon::ThreadPoolBuilder;
use vid_repair_core::config::{Config, ConfigOverrides, FixPolicy, ScanDepth};
use vid_repair_core::fix::{self, FixOutcome};
use vid_repair_core::report::{render_fix_line, render_json, render_scan_line, render_summary};
use vid_repair_core::report::{
render_fix_line, render_json, render_scan_line, render_summary, FixJsonReport,
ScanJsonReport, SCHEMA_VERSION,
};
use vid_repair_core::rules::{ensure_ruleset_loaded, RuleSet};
use vid_repair_core::scan::{scan_file, ScanOutcome};
use vid_repair_core::{fs, watch};
@@ -55,6 +58,7 @@ enum Commands {
Fix(FixArgs),
Report(ScanArgs),
Config(ConfigArgs),
Rules(RulesArgs),
}
#[derive(ValueEnum, Debug, Clone, Copy)]
@@ -149,6 +153,17 @@ struct ConfigArgs {
command: ConfigCommand,
}
// CLI arguments for the `rules` subcommand tree.
// NOTE(review): plain `//` comments on purpose here -- `///` doc comments on
// clap-derive items become --help text and would change CLI output.
#[derive(Parser, Debug)]
struct RulesArgs {
    #[command(subcommand)]
    command: RulesCommand,
}
// Ruleset maintenance operations.
#[derive(Subcommand, Debug)]
enum RulesCommand {
    // Validates all loadable rule files; returns an error when the lint
    // reports errors (see handle_rules).
    Lint,
}
#[derive(Subcommand, Debug)]
enum ConfigCommand {
Init {
@@ -180,6 +195,7 @@ fn main() -> Result<()> {
match command {
Commands::Config(args) => handle_config(args, common.config.clone()),
Commands::Rules(args) => handle_rules(args),
Commands::Scan(args) => handle_scan(args, &common),
Commands::Report(args) => handle_report(args, &common),
Commands::Fix(args) => handle_fix(args, &common),
@@ -197,6 +213,29 @@ fn handle_config(args: ConfigArgs, path: Option<PathBuf>) -> Result<()> {
}
}
/// Dispatches `rules` subcommands. For `lint`, loads the raw rules, runs the
/// linter, prints findings to stderr (keeping stdout machine-readable), and
/// fails when the ruleset is empty or any hard error is reported.
fn handle_rules(args: RulesArgs) -> Result<()> {
    match args.command {
        RulesCommand::Lint => {
            let rules = vid_repair_core::rules::load_raw_rules()?;
            if rules.is_empty() {
                anyhow::bail!("No rules found to lint.");
            }
            let report = vid_repair_core::rules::lint_rules(&rules);
            report
                .warnings
                .iter()
                .for_each(|message| eprintln!("[WARN] {}", message));
            report
                .errors
                .iter()
                .for_each(|message| eprintln!("[ERROR] {}", message));
            if report.has_errors() {
                anyhow::bail!("Ruleset lint failed with {} errors", report.errors.len());
            }
            println!("Ruleset lint OK ({} warnings)", report.warnings.len());
            Ok(())
        }
    }
}
fn handle_scan(args: ScanArgs, common: &CommonArgs) -> Result<()> {
let (mut config, _config_path) = Config::load_or_init(common.config.clone())?;
let mut overrides = ConfigOverrides::default();
@@ -238,7 +277,11 @@ fn handle_scan(args: ScanArgs, common: &CommonArgs) -> Result<()> {
let scans = run_scans(files, &config, &ruleset)?;
if config.report.json {
let json = render_json(&scans, config.report.pretty)?;
let payload = ScanJsonReport {
schema_version: SCHEMA_VERSION.to_string(),
scans,
};
let json = render_json(&payload, config.report.pretty)?;
println!("{}", json);
} else {
for scan in &scans {
@@ -298,7 +341,11 @@ fn handle_fix(args: FixArgs, common: &CommonArgs) -> Result<()> {
let (scans, fixes) = run_fixes(files, &config, &ruleset, args.dry_run)?;
if config.report.json {
let payload = serde_json::json!({ "scans": scans, "fixes": fixes });
let payload = FixJsonReport {
schema_version: SCHEMA_VERSION.to_string(),
scans,
fixes,
};
let json = render_json(&payload, config.report.pretty)?;
println!("{}", json);
} else {