Initial commit

This commit is contained in:
2025-12-30 10:51:50 -05:00
parent 12315c4925
commit 3c0c022c79
21 changed files with 6689 additions and 1 deletions

150
src/parse.rs Normal file
View File

@@ -0,0 +1,150 @@
use std::path::Path;
use regex::Regex;
use crate::utils::{collapse_whitespace, normalize_title};
/// Hints recovered from a media file name by `parse_filename`.
#[derive(Debug, Clone)]
pub struct FileHints {
/// Best-guess title: the tokens that survive tag filtering joined with spaces,
/// or a whitespace-collapsed fallback of the cleaned stem. `None` when nothing
/// usable remains.
pub title: Option<String>,
/// `title` passed through `normalize_title`; `None` exactly when `title` is `None`.
pub normalized_title: Option<String>,
/// Last `19xx`/`20xx` number found in the file stem, if any.
pub year: Option<i32>,
/// Alternative title candidates. As populated by `parse_filename`, this holds at
/// most the text before the first " - " separator of the bracket-stripped stem.
pub alt_titles: Vec<String>,
}
/// Derives title/year hints from the file name of `path`.
///
/// Only the file stem (name without its final extension) is inspected. The
/// steps are:
///
/// 1. pick a release year from the raw stem,
/// 2. strip `[...]` and `(...)` groups,
/// 3. collect alternative titles from a " - " separated stem,
/// 4. tokenize what is left, discarding release tags and the year.
///
/// When tokenizing leaves nothing, the bracket-stripped stem with the year
/// text removed is used as the title instead; if that is empty as well the
/// title is `None`. A path without a file name yields empty hints.
pub fn parse_filename(path: &Path) -> FileHints {
    let stem = match path.file_stem() {
        Some(name) => name.to_string_lossy().into_owned(),
        None => String::new(),
    };

    let year = extract_year(&stem);
    let cleaned = strip_bracketed(&stem);
    let alt_titles = extract_alt_titles(&cleaned, year);
    let tokens = tokenize(&cleaned, year);

    let title = if !tokens.is_empty() {
        Some(collapse_whitespace(&tokens.join(" ")))
    } else {
        // Every token was filtered out: fall back to the cleaned stem itself,
        // minus the year digits.
        let without_year = match year {
            Some(y) => cleaned.replace(&y.to_string(), ""),
            None => cleaned,
        };
        Some(collapse_whitespace(&without_year)).filter(|text| !text.is_empty())
    };
    let normalized_title = title.as_deref().map(normalize_title);

    FileHints {
        title,
        normalized_title,
        year,
        alt_titles,
    }
}
fn extract_year(raw: &str) -> Option<i32> {
let re = Regex::new(r"(19|20)\d{2}").ok()?;
let mut year: Option<i32> = None;
for mat in re.find_iter(raw) {
if let Ok(parsed) = mat.as_str().parse::<i32>() {
year = Some(parsed);
}
}
year
}
fn strip_bracketed(raw: &str) -> String {
let re_square = Regex::new(r"\[[^\]]*\]").unwrap();
let re_round = Regex::new(r"\([^\)]*\)").unwrap();
let without_square = re_square.replace_all(raw, " ");
let without_round = re_round.replace_all(&without_square, " ");
without_round.to_string()
}
/// Collects alternative title candidates from a " - " separated name.
///
/// `raw` is split at the first " - ". The left part, with the year text
/// removed and whitespace collapsed, becomes the single alternative title,
/// but only when both it and the whitespace-collapsed right part are
/// non-empty. In every other case the result is empty.
fn extract_alt_titles(raw: &str, year: Option<i32>) -> Vec<String> {
    match raw.split_once(" - ") {
        Some((head, tail)) => {
            let candidate = clean_title_fragment(head, year);
            // The right-hand side only decides whether the split was meaningful.
            let suffix = collapse_whitespace(tail);
            if candidate.is_empty() || suffix.is_empty() {
                Vec::new()
            } else {
                vec![candidate]
            }
        }
        None => Vec::new(),
    }
}
/// Removes the year text from `fragment` and collapses its whitespace.
///
/// Every occurrence of the decimal rendering of `year` is replaced by a
/// space before `collapse_whitespace` is applied. With `year == None` only
/// the whitespace collapsing happens.
fn clean_title_fragment(fragment: &str, year: Option<i32>) -> String {
    let without_year = match year {
        Some(y) => fragment.replace(&y.to_string(), " "),
        None => fragment.to_owned(),
    };
    collapse_whitespace(&without_year)
}
/// Splits a cleaned file stem into candidate title words.
///
/// `raw` is cut at every non-alphanumeric character. A piece is discarded
/// when it is:
///
/// - empty,
/// - exactly the decimal rendering of `year`,
/// - a known release tag from `stopwords` (compared after ASCII lowercasing), or
/// - made of two to eight ASCII uppercase letters, which is treated as a
///   release-group or format tag (for example `GROUP`).
///
/// Single uppercase letters are deliberately kept so that one-letter words
/// such as "A" and "I" survive in titles like `A.Quiet.Place` or `I.Robot`.
///
/// The surviving pieces are returned in their original order and casing.
fn tokenize(raw: &str, year: Option<i32>) -> Vec<String> {
    let stopwords = stopwords();
    // Render the year once instead of once per token.
    let year_text = year.map(|y| y.to_string());

    raw.split(|c: char| !c.is_alphanumeric())
        .filter(|token| {
            if token.is_empty() || year_text.as_deref() == Some(*token) {
                return false;
            }
            if stopwords.contains(token.to_ascii_lowercase().as_str()) {
                return false;
            }
            // Short all-caps runs look like release tags, but a lone capital
            // letter is far more likely to be a real word.
            let all_caps = token.bytes().all(|b| b.is_ascii_uppercase());
            !(all_caps && (2..=8).contains(&token.len()))
        })
        .map(str::to_owned)
        .collect()
}
/// Returns the lowercase release tags that must never become part of a title.
///
/// Callers are expected to lowercase a token before looking it up. The set is
/// rebuilt on every call.
///
/// `tokenize` splits names on every non-alphanumeric character, so a hyphenated
/// tag such as `web-dl` reaches the lookup as `web` and `dl`. Both halves are
/// therefore listed; the hyphenated spelling is kept for callers that look up
/// unsplit text.
fn stopwords() -> std::collections::HashSet<&'static str> {
    // The grouping below is editorial only; membership is all that matters.
    const RELEASE_TAGS: &[&str] = &[
        // Resolution and picture quality.
        "1080p", "720p", "2160p", "480p", "360p", "4k", "uhd", "hdr", "hd", "hq",
        // Source / rip type.
        "dvdrip", "bdrip", "brrip", "bluray", "blu", "webdl", "web-dl", "web", "dl",
        "webrip", "hdrip", "remux", "cam", "ts", "dvdscr", "r5", "r6",
        // Video and audio codecs.
        "x264", "x265", "h264", "h265", "hevc", "aac", "dts", "ac3",
        // Edition markers.
        "proper", "repack", "limited", "extended", "uncut", "remastered",
        // Subtitle and dubbing markers.
        "subbed", "subs", "multi", "dubbed", "dub",
        // Release groups / sites.
        "yts", "yify", "rarbg",
    ];
    RELEASE_TAGS.iter().copied().collect()
}
#[cfg(test)]
mod tests {
    use super::parse_filename;
    use std::path::Path;

    /// Dotted release name: tags and the trailing group are dropped and the
    /// year is captured.
    #[test]
    fn parses_basic_title_and_year() {
        let hints = parse_filename(Path::new("Some.Movie.2020.1080p.BluRay.x264-GROUP.mkv"));
        assert_eq!(hints.title.as_deref(), Some("Some Movie"));
        assert_eq!(hints.year, Some(2020));
    }

    /// Bracketed groups are stripped from the title while the year inside the
    /// parentheses is still picked up.
    #[test]
    fn handles_brackets_and_stopwords() {
        let hints = parse_filename(Path::new("[YTS] The.Matrix.(1999).1080p.BluRay.mkv"));
        assert_eq!(hints.title.as_deref(), Some("The Matrix"));
        assert_eq!(hints.year, Some(1999));
    }

    /// A " - " separated name keeps both parts in the title and records the
    /// left part as an alternative title.
    #[test]
    fn adds_alt_title_for_dash_suffix() {
        let hints = parse_filename(Path::new("Zootopia - Vlix.mp4"));
        assert_eq!(hints.title.as_deref(), Some("Zootopia Vlix"));
        assert!(hints.alt_titles.contains(&String::from("Zootopia")));
    }
}