Optimize filename parsing

This commit is contained in:
2025-12-30 11:30:56 -05:00
parent 32fa0e8539
commit c5b20c252f
3 changed files with 26 additions and 21 deletions

1
Cargo.lock generated
View File

@@ -1348,6 +1348,7 @@ dependencies = [
"is-terminal", "is-terminal",
"libc", "libc",
"num_cpus", "num_cpus",
"once_cell",
"owo-colors", "owo-colors",
"predicates", "predicates",
"rayon", "rayon",

View File

@@ -14,6 +14,7 @@ is-terminal = "0.4"
libc = "0.2" libc = "0.2"
num_cpus = "1.16" num_cpus = "1.16"
owo-colors = "4.1" owo-colors = "4.1"
once_cell = "1.19"
rayon = "1.10" rayon = "1.10"
regex = "1.10" regex = "1.10"
reqwest = { version = "0.11", features = ["blocking", "json", "rustls-tls"] } reqwest = { version = "0.11", features = ["blocking", "json", "rustls-tls"] }

View File

@@ -1,5 +1,7 @@
use std::collections::HashSet;
use std::path::Path; use std::path::Path;
use once_cell::sync::Lazy;
use regex::Regex; use regex::Regex;
use crate::utils::{collapse_whitespace, normalize_title}; use crate::utils::{collapse_whitespace, normalize_title};
@@ -12,6 +14,24 @@ pub struct FileHints {
pub alt_titles: Vec<String>, pub alt_titles: Vec<String>,
} }
/// Shared, lazily-compiled regex for four-digit year candidates.
///
/// The pattern matches the literal prefix `19` or `20` followed by exactly two
/// digits. It carries no word-boundary anchors, so it can also match inside a
/// longer run of digits. `extract_year` iterates over its matches with
/// `find_iter`.
///
/// # Panics
///
/// The initializer calls `expect`, so first access panics if the pattern fails
/// to compile.
static YEAR_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(19|20)\d{2}").expect("year regex"));
/// Shared, lazily-compiled regex for a `[...]` group.
///
/// Matches an opening `[`, any run of characters other than `]`, and the
/// closing `]`. `strip_bracketed` replaces every match with a single space.
///
/// # Panics
///
/// The initializer calls `expect`, so first access panics if the pattern fails
/// to compile.
static BRACKET_SQUARE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\[[^\]]*\]").expect("square bracket regex"));
/// Shared, lazily-compiled regex for a `(...)` group.
///
/// Matches an opening `(`, any run of characters other than `)`, and the
/// closing `)`. `strip_bracketed` applies it to the output of the
/// square-bracket pass and replaces every match with a single space.
///
/// # Panics
///
/// The initializer calls `expect`, so first access panics if the pattern fails
/// to compile.
static BRACKET_ROUND_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\([^\)]*\)").expect("round bracket regex"));
/// Shared, lazily-built set of tokens that `tokenize` skips.
///
/// `tokenize` checks `STOPWORDS.contains(lower.as_str())` and skips the token
/// on a hit. Every entry here is lowercase, so `lower` is presumably the
/// lowercased token; confirm against the full `tokenize` body.
///
/// NOTE(review): `tokenize` splits its input on every non-alphanumeric
/// character, so a token can never contain `-`. The `"web-dl"` entry therefore
/// looks unreachable from that call site; verify whether `"dl"` was intended.
static STOPWORDS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
[
"1080p", "720p", "2160p", "480p", "360p", "4k", "uhd", "hdr", "dvdrip",
"bdrip", "brrip", "bluray", "blu", "webdl", "web-dl", "webrip", "hdrip",
"remux", "x264", "x265", "h264", "h265", "hevc", "aac", "dts", "ac3",
"proper", "repack", "limited", "extended", "uncut", "remastered", "subbed",
"subs", "multi", "dubbed", "dub", "yts", "yify", "rarbg", "web", "hd",
"hq", "cam", "ts", "dvdscr", "r5", "r6",
]
.into_iter()
.collect()
});
pub fn parse_filename(path: &Path) -> FileHints { pub fn parse_filename(path: &Path) -> FileHints {
let stem = path let stem = path
.file_stem() .file_stem()
@@ -44,9 +64,8 @@ pub fn parse_filename(path: &Path) -> FileHints {
} }
fn extract_year(raw: &str) -> Option<i32> { fn extract_year(raw: &str) -> Option<i32> {
let re = Regex::new(r"(19|20)\d{2}").ok()?;
let mut year: Option<i32> = None; let mut year: Option<i32> = None;
for mat in re.find_iter(raw) { for mat in YEAR_RE.find_iter(raw) {
if let Ok(parsed) = mat.as_str().parse::<i32>() { if let Ok(parsed) = mat.as_str().parse::<i32>() {
year = Some(parsed); year = Some(parsed);
} }
@@ -55,10 +74,8 @@ fn extract_year(raw: &str) -> Option<i32> {
} }
fn strip_bracketed(raw: &str) -> String { fn strip_bracketed(raw: &str) -> String {
let re_square = Regex::new(r"\[[^\]]*\]").unwrap(); let without_square = BRACKET_SQUARE_RE.replace_all(raw, " ");
let re_round = Regex::new(r"\([^\)]*\)").unwrap(); let without_round = BRACKET_ROUND_RE.replace_all(&without_square, " ");
let without_square = re_square.replace_all(raw, " ");
let without_round = re_round.replace_all(&without_square, " ");
without_round.to_string() without_round.to_string()
} }
@@ -83,7 +100,6 @@ fn clean_title_fragment(fragment: &str, year: Option<i32>) -> String {
} }
fn tokenize(raw: &str, year: Option<i32>) -> Vec<String> { fn tokenize(raw: &str, year: Option<i32>) -> Vec<String> {
let stopwords = stopwords();
let mut tokens = Vec::new(); let mut tokens = Vec::new();
for token in raw.split(|c: char| !c.is_alphanumeric()) { for token in raw.split(|c: char| !c.is_alphanumeric()) {
if token.is_empty() { if token.is_empty() {
@@ -95,7 +111,7 @@ fn tokenize(raw: &str, year: Option<i32>) -> Vec<String> {
continue; continue;
} }
} }
if stopwords.contains(lower.as_str()) { if STOPWORDS.contains(lower.as_str()) {
continue; continue;
} }
if token.chars().all(|c| c.is_ascii_uppercase()) && token.len() <= 8 { if token.chars().all(|c| c.is_ascii_uppercase()) && token.len() <= 8 {
@@ -106,19 +122,6 @@ fn tokenize(raw: &str, year: Option<i32>) -> Vec<String> {
tokens tokens
} }
fn stopwords() -> std::collections::HashSet<&'static str> {
[
"1080p", "720p", "2160p", "480p", "360p", "4k", "uhd", "hdr", "dvdrip",
"bdrip", "brrip", "bluray", "blu", "webdl", "web-dl", "webrip", "hdrip",
"remux", "x264", "x265", "h264", "h265", "hevc", "aac", "dts", "ac3",
"proper", "repack", "limited", "extended", "uncut", "remastered", "subbed",
"subs", "multi", "dubbed", "dub", "yts", "yify", "rarbg", "web", "hd",
"hq", "cam", "ts", "dvdscr", "r5", "r6",
]
.into_iter()
.collect()
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::parse_filename; use super::parse_filename;