diff --git a/Cargo.lock b/Cargo.lock index 4b764ea..8bf7b12 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1348,6 +1348,7 @@ dependencies = [ "is-terminal", "libc", "num_cpus", + "once_cell", "owo-colors", "predicates", "rayon", diff --git a/Cargo.toml b/Cargo.toml index 20e9b29..cf2bf5f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ is-terminal = "0.4" libc = "0.2" num_cpus = "1.16" owo-colors = "4.1" +once_cell = "1.19" rayon = "1.10" regex = "1.10" reqwest = { version = "0.11", features = ["blocking", "json", "rustls-tls"] } diff --git a/src/parse.rs b/src/parse.rs index 2b9ea1a..48bcf29 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -1,5 +1,7 @@ +use std::collections::HashSet; use std::path::Path; +use once_cell::sync::Lazy; use regex::Regex; use crate::utils::{collapse_whitespace, normalize_title}; @@ -12,6 +14,24 @@ pub struct FileHints { pub alt_titles: Vec, } +static YEAR_RE: Lazy = Lazy::new(|| Regex::new(r"(19|20)\d{2}").expect("year regex")); +static BRACKET_SQUARE_RE: Lazy = + Lazy::new(|| Regex::new(r"\[[^\]]*\]").expect("square bracket regex")); +static BRACKET_ROUND_RE: Lazy = + Lazy::new(|| Regex::new(r"\([^\)]*\)").expect("round bracket regex")); +static STOPWORDS: Lazy> = Lazy::new(|| { + [ + "1080p", "720p", "2160p", "480p", "360p", "4k", "uhd", "hdr", "dvdrip", + "bdrip", "brrip", "bluray", "blu", "webdl", "web-dl", "webrip", "hdrip", + "remux", "x264", "x265", "h264", "h265", "hevc", "aac", "dts", "ac3", + "proper", "repack", "limited", "extended", "uncut", "remastered", "subbed", + "subs", "multi", "dubbed", "dub", "yts", "yify", "rarbg", "web", "hd", + "hq", "cam", "ts", "dvdscr", "r5", "r6", + ] + .into_iter() + .collect() +}); + pub fn parse_filename(path: &Path) -> FileHints { let stem = path .file_stem() @@ -44,9 +64,8 @@ pub fn parse_filename(path: &Path) -> FileHints { } fn extract_year(raw: &str) -> Option { - let re = Regex::new(r"(19|20)\d{2}").ok()?; let mut year: Option = None; - for mat in re.find_iter(raw) { + for mat in YEAR_RE.find_iter(raw) { if let Ok(parsed) = mat.as_str().parse::() { year = Some(parsed); } @@ -55,10 +74,8 @@ fn extract_year(raw: &str) -> Option { } fn strip_bracketed(raw: &str) -> String { - let re_square = Regex::new(r"\[[^\]]*\]").unwrap(); - let re_round = Regex::new(r"\([^\)]*\)").unwrap(); - let without_square = re_square.replace_all(raw, " "); - let without_round = re_round.replace_all(&without_square, " "); + let without_square = BRACKET_SQUARE_RE.replace_all(raw, " "); + let without_round = BRACKET_ROUND_RE.replace_all(&without_square, " "); without_round.to_string() } @@ -83,7 +100,6 @@ fn clean_title_fragment(fragment: &str, year: Option) -> String { } fn tokenize(raw: &str, year: Option) -> Vec { - let stopwords = stopwords(); let mut tokens = Vec::new(); for token in raw.split(|c: char| !c.is_alphanumeric()) { if token.is_empty() { @@ -95,7 +111,7 @@ fn tokenize(raw: &str, year: Option) -> Vec { continue; } } - if stopwords.contains(lower.as_str()) { + if STOPWORDS.contains(lower.as_str()) { continue; } if token.chars().all(|c| c.is_ascii_uppercase()) && token.len() <= 8 { @@ -106,19 +122,6 @@ fn tokenize(raw: &str, year: Option) -> Vec { tokens } -fn stopwords() -> std::collections::HashSet<&'static str> { - [ - "1080p", "720p", "2160p", "480p", "360p", "4k", "uhd", "hdr", "dvdrip", - "bdrip", "brrip", "bluray", "blu", "webdl", "web-dl", "webrip", "hdrip", - "remux", "x264", "x265", "h264", "h265", "hevc", "aac", "dts", "ac3", - "proper", "repack", "limited", "extended", "uncut", "remastered", "subbed", - "subs", "multi", "dubbed", "dub", "yts", "yify", "rarbg", "web", "hd", - "hq", "cam", "ts", "dvdscr", "r5", "r6", - ] - .into_iter() - .collect() -} - #[cfg(test)] mod tests { use super::parse_filename;