181 lines
5.6 KiB
Rust
181 lines
5.6 KiB
Rust
use std::collections::HashSet;
|
|
use std::path::Path;
|
|
|
|
use once_cell::sync::Lazy;
|
|
use regex::Regex;
|
|
|
|
use crate::utils::{collapse_whitespace, normalize_title};
|
|
|
|
/// Title and year hints recovered from a media file name.
#[derive(Debug, Clone)]
pub struct FileHints {
    /// Best-guess title text, or `None` when nothing usable was left after
    /// filtering the file name.
    pub title: Option<String>,
    /// `title` passed through `normalize_title`; `None` exactly when `title`
    /// is `None`.
    pub normalized_title: Option<String>,
    /// Release year found in the file name, if any.
    pub year: Option<i32>,
    /// Additional candidate titles derived from the file name.
    pub alt_titles: Vec<String>,
}
|
|
|
|
/// Matches any four characters made of `19` or `20` followed by two digits.
/// The pattern carries no boundary anchors, so it can also match inside a
/// longer run of digits or letters.
static YEAR_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(19|20)\d{2}").expect("year regex"));
|
|
/// Matches one `[...]` group: an opening `[`, any run of characters other
/// than `]`, and the closing `]`.
static BRACKET_SQUARE_RE: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"\[[^\]]*\]";
    Regex::new(pattern).expect("square bracket regex")
});
|
|
/// Matches one `(...)` group: an opening `(`, any run of characters other
/// than `)`, and the closing `)`.
static BRACKET_ROUND_RE: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"\([^\)]*\)";
    Regex::new(pattern).expect("round bracket regex")
});
|
|
/// Lower-case release tags (resolution, source, codec, release-group names,
/// ...) that `tokenize` removes from a file name before building the title.
///
/// Lookups are done with the ASCII-lower-cased form of each token.
static STOPWORDS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    [
        "1080p", "720p", "2160p", "480p", "360p", "4k", "uhd", "hdr", "dvdrip",
        "bdrip", "brrip", "bluray", "blu", "webdl", "web-dl", "webrip", "hdrip",
        "remux", "x264", "x265", "h264", "h265", "hevc", "aac", "dts", "ac3",
        "proper", "repack", "limited", "extended", "uncut", "remastered", "subbed",
        "subs", "multi", "dubbed", "dub", "yts", "yify", "rarbg", "web", "hd",
        "hq", "cam", "ts", "dvdscr", "r5", "r6",
        // `tokenize` splits on every non-alphanumeric character before the
        // lookup, so "web-dl" is seen as "web" + "dl" and the hyphenated entry
        // above can never match. "dl" has to be listed on its own, otherwise
        // a lower-case "web-dl" leaks "dl" into the title.
        "dl",
    ]
    .into_iter()
    .collect()
});
|
|
|
|
pub fn parse_filename(path: &Path) -> FileHints {
|
|
let stem = path
|
|
.file_stem()
|
|
.map(|s| s.to_string_lossy().to_string())
|
|
.unwrap_or_default();
|
|
|
|
let year = extract_year(&stem);
|
|
let cleaned = strip_bracketed(&stem);
|
|
let alt_titles = extract_alt_titles(&cleaned, year);
|
|
let tokens = tokenize(&cleaned, year);
|
|
|
|
let title = if tokens.is_empty() {
|
|
let mut fallback = cleaned.clone();
|
|
if let Some(year) = year {
|
|
fallback = fallback.replace(&year.to_string(), "");
|
|
}
|
|
let fallback = collapse_whitespace(&fallback);
|
|
if fallback.is_empty() { None } else { Some(fallback) }
|
|
} else {
|
|
Some(collapse_whitespace(&tokens.join(" ")))
|
|
};
|
|
let normalized_title = title.as_deref().map(normalize_title);
|
|
|
|
FileHints {
|
|
title,
|
|
normalized_title,
|
|
year,
|
|
alt_titles,
|
|
}
|
|
}
|
|
|
|
fn extract_year(raw: &str) -> Option<i32> {
|
|
let mut year: Option<i32> = None;
|
|
for mat in YEAR_RE.find_iter(raw) {
|
|
if let Ok(parsed) = mat.as_str().parse::<i32>() {
|
|
year = Some(parsed);
|
|
}
|
|
}
|
|
year
|
|
}
|
|
|
|
fn strip_bracketed(raw: &str) -> String {
|
|
let without_square = BRACKET_SQUARE_RE.replace_all(raw, " ");
|
|
let without_round = BRACKET_ROUND_RE.replace_all(&without_square, " ");
|
|
without_round.to_string()
|
|
}
|
|
|
|
fn extract_alt_titles(raw: &str, year: Option<i32>) -> Vec<String> {
|
|
let mut alt_titles = Vec::new();
|
|
if let Some((left, right)) = raw.split_once(" - ") {
|
|
let left = clean_title_fragment(left, year);
|
|
let right = collapse_whitespace(right);
|
|
if !left.is_empty() && !right.is_empty() {
|
|
alt_titles.push(left);
|
|
}
|
|
}
|
|
alt_titles
|
|
}
|
|
|
|
fn clean_title_fragment(fragment: &str, year: Option<i32>) -> String {
|
|
let mut cleaned = fragment.to_string();
|
|
if let Some(year) = year {
|
|
cleaned = cleaned.replace(&year.to_string(), " ");
|
|
}
|
|
collapse_whitespace(&cleaned)
|
|
}
|
|
|
|
fn tokenize(raw: &str, year: Option<i32>) -> Vec<String> {
|
|
let mut tokens = Vec::new();
|
|
for token in raw.split(|c: char| !c.is_alphanumeric()) {
|
|
if token.is_empty() {
|
|
continue;
|
|
}
|
|
let lower = token.to_ascii_lowercase();
|
|
if let Some(year) = year {
|
|
if lower == year.to_string() {
|
|
continue;
|
|
}
|
|
}
|
|
if STOPWORDS.contains(lower.as_str()) {
|
|
continue;
|
|
}
|
|
if token.chars().all(|c| c.is_ascii_uppercase()) && token.len() <= 8 {
|
|
continue;
|
|
}
|
|
tokens.push(token.to_string());
|
|
}
|
|
tokens
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::{parse_filename, FileHints};
    use std::path::Path;

    /// Runs the parser on a bare file name.
    fn hints_for(file_name: &str) -> FileHints {
        parse_filename(Path::new(file_name))
    }

    #[test]
    fn parses_basic_title_and_year() {
        let hints = hints_for("Some.Movie.2020.1080p.BluRay.x264-GROUP.mkv");
        assert_eq!(hints.title.as_deref(), Some("Some Movie"));
        assert_eq!(hints.year, Some(2020));
    }

    #[test]
    fn handles_brackets_and_stopwords() {
        let hints = hints_for("[YTS] The.Matrix.(1999).1080p.BluRay.mkv");
        assert_eq!(hints.title.as_deref(), Some("The Matrix"));
        assert_eq!(hints.year, Some(1999));
    }

    #[test]
    fn adds_alt_title_for_dash_suffix() {
        let hints = hints_for("Zootopia - Vlix.mp4");
        assert_eq!(hints.title.as_deref(), Some("Zootopia Vlix"));
        assert!(hints.alt_titles.contains(&"Zootopia".to_string()));
    }

    #[test]
    fn handles_foreign_title_ascii() {
        let hints = hints_for("Cidade.de.Deus.2002.1080p.BluRay.x264.mkv");
        assert_eq!(hints.title.as_deref(), Some("Cidade de Deus"));
        assert_eq!(hints.year, Some(2002));
    }

    #[test]
    fn strips_release_tags() {
        let hints = hints_for("Movie.Title.2019.1080p.HDRip.x264.AAC.mkv");
        assert_eq!(hints.title.as_deref(), Some("Movie Title"));
        assert_eq!(hints.year, Some(2019));
    }

    #[test]
    fn handles_subtitle_separator() {
        let hints = hints_for("Doctor.Strange.In.The.Multiverse.of.Madness.2022.2160p.mkv");
        assert_eq!(
            hints.title.as_deref(),
            Some("Doctor Strange In The Multiverse of Madness")
        );
        assert_eq!(hints.year, Some(2022));
    }
}
|