Initial commit
This commit is contained in:
150
src/parse.rs
Normal file
150
src/parse.rs
Normal file
@@ -0,0 +1,150 @@
|
||||
use std::path::Path;
|
||||
|
||||
use regex::Regex;
|
||||
|
||||
use crate::utils::{collapse_whitespace, normalize_title};
|
||||
|
||||
/// Metadata guessed from a media file's name by [`parse_filename`].
///
/// `Default` yields a value with no title, no year and no alternative titles;
/// `PartialEq`/`Eq` allow whole-struct comparisons (handy in tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct FileHints {
    /// Title assembled from the filename, or `None` when nothing usable
    /// remains after filtering.
    pub title: Option<String>,
    /// `title` passed through `normalize_title`; `None` exactly when `title`
    /// is `None`.
    pub normalized_title: Option<String>,
    /// Last four-digit run starting with `19` or `20` found in the file stem.
    pub year: Option<i32>,
    /// Alternative title candidates; currently the text before the first
    /// `" - "` separator, when both sides of it are non-empty.
    pub alt_titles: Vec<String>,
}
|
||||
|
||||
pub fn parse_filename(path: &Path) -> FileHints {
|
||||
let stem = path
|
||||
.file_stem()
|
||||
.map(|s| s.to_string_lossy().to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
let year = extract_year(&stem);
|
||||
let cleaned = strip_bracketed(&stem);
|
||||
let alt_titles = extract_alt_titles(&cleaned, year);
|
||||
let tokens = tokenize(&cleaned, year);
|
||||
|
||||
let title = if tokens.is_empty() {
|
||||
let mut fallback = cleaned.clone();
|
||||
if let Some(year) = year {
|
||||
fallback = fallback.replace(&year.to_string(), "");
|
||||
}
|
||||
let fallback = collapse_whitespace(&fallback);
|
||||
if fallback.is_empty() { None } else { Some(fallback) }
|
||||
} else {
|
||||
Some(collapse_whitespace(&tokens.join(" ")))
|
||||
};
|
||||
let normalized_title = title.as_deref().map(normalize_title);
|
||||
|
||||
FileHints {
|
||||
title,
|
||||
normalized_title,
|
||||
year,
|
||||
alt_titles,
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_year(raw: &str) -> Option<i32> {
|
||||
let re = Regex::new(r"(19|20)\d{2}").ok()?;
|
||||
let mut year: Option<i32> = None;
|
||||
for mat in re.find_iter(raw) {
|
||||
if let Ok(parsed) = mat.as_str().parse::<i32>() {
|
||||
year = Some(parsed);
|
||||
}
|
||||
}
|
||||
year
|
||||
}
|
||||
|
||||
fn strip_bracketed(raw: &str) -> String {
|
||||
let re_square = Regex::new(r"\[[^\]]*\]").unwrap();
|
||||
let re_round = Regex::new(r"\([^\)]*\)").unwrap();
|
||||
let without_square = re_square.replace_all(raw, " ");
|
||||
let without_round = re_round.replace_all(&without_square, " ");
|
||||
without_round.to_string()
|
||||
}
|
||||
|
||||
fn extract_alt_titles(raw: &str, year: Option<i32>) -> Vec<String> {
|
||||
let mut alt_titles = Vec::new();
|
||||
if let Some((left, right)) = raw.split_once(" - ") {
|
||||
let left = clean_title_fragment(left, year);
|
||||
let right = collapse_whitespace(right);
|
||||
if !left.is_empty() && !right.is_empty() {
|
||||
alt_titles.push(left);
|
||||
}
|
||||
}
|
||||
alt_titles
|
||||
}
|
||||
|
||||
fn clean_title_fragment(fragment: &str, year: Option<i32>) -> String {
|
||||
let mut cleaned = fragment.to_string();
|
||||
if let Some(year) = year {
|
||||
cleaned = cleaned.replace(&year.to_string(), " ");
|
||||
}
|
||||
collapse_whitespace(&cleaned)
|
||||
}
|
||||
|
||||
fn tokenize(raw: &str, year: Option<i32>) -> Vec<String> {
|
||||
let stopwords = stopwords();
|
||||
let mut tokens = Vec::new();
|
||||
for token in raw.split(|c: char| !c.is_alphanumeric()) {
|
||||
if token.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let lower = token.to_ascii_lowercase();
|
||||
if let Some(year) = year {
|
||||
if lower == year.to_string() {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if stopwords.contains(lower.as_str()) {
|
||||
continue;
|
||||
}
|
||||
if token.chars().all(|c| c.is_ascii_uppercase()) && token.len() <= 8 {
|
||||
continue;
|
||||
}
|
||||
tokens.push(token.to_string());
|
||||
}
|
||||
tokens
|
||||
}
|
||||
|
||||
/// Returns the set of lowercase release-noise words (resolutions, sources,
/// codecs, release groups, ...) that must not become part of a title.
///
/// Tokens are produced by splitting on non-alphanumeric characters, so a
/// hyphenated entry such as `"web-dl"` can never match a token on its own;
/// its halves (`"web"`, `"dl"`) are listed separately for that reason.
///
/// NOTE(review): `"blu"` is listed but `"ray"` is not, so a mixed-case
/// "Blu-Ray" still leaves "Ray" behind; adding `"ray"` would also hide the
/// title word "Ray" — decide deliberately.
fn stopwords() -> std::collections::HashSet<&'static str> {
    const STOPWORDS: &[&str] = &[
        "1080p", "720p", "2160p", "480p", "360p", "4k", "uhd", "hdr", "dvdrip",
        "bdrip", "brrip", "bluray", "blu", "webdl", "web-dl", "webrip", "hdrip",
        "remux", "x264", "x265", "h264", "h265", "hevc", "aac", "dts", "ac3",
        "proper", "repack", "limited", "extended", "uncut", "remastered", "subbed",
        "subs", "multi", "dubbed", "dub", "yts", "yify", "rarbg", "web", "hd",
        "hq", "cam", "ts", "dvdscr", "r5", "r6",
        // Second half of "web-dl" once the tokenizer has split on '-'.
        "dl",
    ];

    STOPWORDS.iter().copied().collect()
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::parse_filename;
    use std::path::Path;

    /// Plain dotted release name: noise tokens and the group tag are dropped.
    #[test]
    fn parses_basic_title_and_year() {
        let hints = parse_filename(Path::new("Some.Movie.2020.1080p.BluRay.x264-GROUP.mkv"));

        assert_eq!(hints.title.as_deref(), Some("Some Movie"));
        assert_eq!(hints.year, Some(2020));
    }

    /// Bracketed segments are stripped, yet a bracketed year is still found.
    #[test]
    fn handles_brackets_and_stopwords() {
        let hints = parse_filename(Path::new("[YTS] The.Matrix.(1999).1080p.BluRay.mkv"));

        assert_eq!(hints.title.as_deref(), Some("The Matrix"));
        assert_eq!(hints.year, Some(1999));
    }

    /// The text before " - " is offered as an alternative title.
    #[test]
    fn adds_alt_title_for_dash_suffix() {
        let hints = parse_filename(Path::new("Zootopia - Vlix.mp4"));

        assert_eq!(hints.title.as_deref(), Some("Zootopia Vlix"));
        assert!(hints.alt_titles.iter().any(|alt| alt == "Zootopia"));
    }
}
|
||||
Reference in New Issue
Block a user