Optimize filename parsing
This commit is contained in:
45
src/parse.rs
45
src/parse.rs
@@ -1,5 +1,7 @@
|
||||
use std::collections::HashSet;
|
||||
use std::path::Path;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
|
||||
use crate::utils::{collapse_whitespace, normalize_title};
|
||||
@@ -12,6 +14,24 @@ pub struct FileHints {
|
||||
pub alt_titles: Vec<String>,
|
||||
}
|
||||
|
||||
/// Matches a four-digit number whose first two digits are `19` or `20`.
///
/// The pattern is compiled lazily on first use and shared by every caller.
// NOTE(review): the pattern is unanchored, so it also matches inside longer
// alphanumeric runs (e.g. the `2160` in `2160p`) — confirm callers allow for that.
static YEAR_RE: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"(19|20)\d{2}";
    Regex::new(pattern).expect("year regex")
});
|
||||
static BRACKET_SQUARE_RE: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r"\[[^\]]*\]").expect("square bracket regex"));
|
||||
static BRACKET_ROUND_RE: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r"\([^\)]*\)").expect("round bracket regex"));
|
||||
/// Set of lowercase tokens that `tokenize` drops: a token is skipped when its
/// `lower` form is found here.
///
/// The set is built lazily on first use and shared by every caller.
// NOTE(review): `tokenize` splits its input on non-alphanumeric characters, so
// the hyphenated "web-dl" entry looks unreachable from that path — confirm
// whether another caller relies on it.
static STOPWORDS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    const WORDS: &[&str] = &[
        "1080p", "720p", "2160p", "480p", "360p", "4k", "uhd", "hdr", "dvdrip",
        "bdrip", "brrip", "bluray", "blu", "webdl", "web-dl", "webrip", "hdrip",
        "remux", "x264", "x265", "h264", "h265", "hevc", "aac", "dts", "ac3",
        "proper", "repack", "limited", "extended", "uncut", "remastered", "subbed",
        "subs", "multi", "dubbed", "dub", "yts", "yify", "rarbg", "web", "hd",
        "hq", "cam", "ts", "dvdscr", "r5", "r6",
    ];
    WORDS.iter().copied().collect()
});
|
||||
|
||||
pub fn parse_filename(path: &Path) -> FileHints {
|
||||
let stem = path
|
||||
.file_stem()
|
||||
@@ -44,9 +64,8 @@ pub fn parse_filename(path: &Path) -> FileHints {
|
||||
}
|
||||
|
||||
fn extract_year(raw: &str) -> Option<i32> {
|
||||
let re = Regex::new(r"(19|20)\d{2}").ok()?;
|
||||
let mut year: Option<i32> = None;
|
||||
for mat in re.find_iter(raw) {
|
||||
for mat in YEAR_RE.find_iter(raw) {
|
||||
if let Ok(parsed) = mat.as_str().parse::<i32>() {
|
||||
year = Some(parsed);
|
||||
}
|
||||
@@ -55,10 +74,8 @@ fn extract_year(raw: &str) -> Option<i32> {
|
||||
}
|
||||
|
||||
fn strip_bracketed(raw: &str) -> String {
|
||||
let re_square = Regex::new(r"\[[^\]]*\]").unwrap();
|
||||
let re_round = Regex::new(r"\([^\)]*\)").unwrap();
|
||||
let without_square = re_square.replace_all(raw, " ");
|
||||
let without_round = re_round.replace_all(&without_square, " ");
|
||||
let without_square = BRACKET_SQUARE_RE.replace_all(raw, " ");
|
||||
let without_round = BRACKET_ROUND_RE.replace_all(&without_square, " ");
|
||||
without_round.to_string()
|
||||
}
|
||||
|
||||
@@ -83,7 +100,6 @@ fn clean_title_fragment(fragment: &str, year: Option<i32>) -> String {
|
||||
}
|
||||
|
||||
fn tokenize(raw: &str, year: Option<i32>) -> Vec<String> {
|
||||
let stopwords = stopwords();
|
||||
let mut tokens = Vec::new();
|
||||
for token in raw.split(|c: char| !c.is_alphanumeric()) {
|
||||
if token.is_empty() {
|
||||
@@ -95,7 +111,7 @@ fn tokenize(raw: &str, year: Option<i32>) -> Vec<String> {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if stopwords.contains(lower.as_str()) {
|
||||
if STOPWORDS.contains(lower.as_str()) {
|
||||
continue;
|
||||
}
|
||||
if token.chars().all(|c| c.is_ascii_uppercase()) && token.len() <= 8 {
|
||||
@@ -106,19 +122,6 @@ fn tokenize(raw: &str, year: Option<i32>) -> Vec<String> {
|
||||
tokens
|
||||
}
|
||||
|
||||
fn stopwords() -> std::collections::HashSet<&'static str> {
|
||||
[
|
||||
"1080p", "720p", "2160p", "480p", "360p", "4k", "uhd", "hdr", "dvdrip",
|
||||
"bdrip", "brrip", "bluray", "blu", "webdl", "web-dl", "webrip", "hdrip",
|
||||
"remux", "x264", "x265", "h264", "h265", "hevc", "aac", "dts", "ac3",
|
||||
"proper", "repack", "limited", "extended", "uncut", "remastered", "subbed",
|
||||
"subs", "multi", "dubbed", "dub", "yts", "yify", "rarbg", "web", "hd",
|
||||
"hq", "cam", "ts", "dvdscr", "r5", "r6",
|
||||
]
|
||||
.into_iter()
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::parse_filename;
|
||||
|
||||
Reference in New Issue
Block a user