Optimize filename parsing
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -1348,6 +1348,7 @@ dependencies = [
|
|||||||
"is-terminal",
|
"is-terminal",
|
||||||
"libc",
|
"libc",
|
||||||
"num_cpus",
|
"num_cpus",
|
||||||
|
"once_cell",
|
||||||
"owo-colors",
|
"owo-colors",
|
||||||
"predicates",
|
"predicates",
|
||||||
"rayon",
|
"rayon",
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ is-terminal = "0.4"
|
|||||||
libc = "0.2"
|
libc = "0.2"
|
||||||
num_cpus = "1.16"
|
num_cpus = "1.16"
|
||||||
owo-colors = "4.1"
|
owo-colors = "4.1"
|
||||||
|
once_cell = "1.19"
|
||||||
rayon = "1.10"
|
rayon = "1.10"
|
||||||
regex = "1.10"
|
regex = "1.10"
|
||||||
reqwest = { version = "0.11", features = ["blocking", "json", "rustls-tls"] }
|
reqwest = { version = "0.11", features = ["blocking", "json", "rustls-tls"] }
|
||||||
|
|||||||
45
src/parse.rs
45
src/parse.rs
@@ -1,5 +1,7 @@
|
|||||||
|
use std::collections::HashSet;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
|
|
||||||
use crate::utils::{collapse_whitespace, normalize_title};
|
use crate::utils::{collapse_whitespace, normalize_title};
|
||||||
@@ -12,6 +14,24 @@ pub struct FileHints {
|
|||||||
pub alt_titles: Vec<String>,
|
pub alt_titles: Vec<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static YEAR_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(19|20)\d{2}").expect("year regex"));
|
||||||
|
static BRACKET_SQUARE_RE: Lazy<Regex> =
|
||||||
|
Lazy::new(|| Regex::new(r"\[[^\]]*\]").expect("square bracket regex"));
|
||||||
|
static BRACKET_ROUND_RE: Lazy<Regex> =
|
||||||
|
Lazy::new(|| Regex::new(r"\([^\)]*\)").expect("round bracket regex"));
|
||||||
|
static STOPWORDS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
|
||||||
|
[
|
||||||
|
"1080p", "720p", "2160p", "480p", "360p", "4k", "uhd", "hdr", "dvdrip",
|
||||||
|
"bdrip", "brrip", "bluray", "blu", "webdl", "web-dl", "webrip", "hdrip",
|
||||||
|
"remux", "x264", "x265", "h264", "h265", "hevc", "aac", "dts", "ac3",
|
||||||
|
"proper", "repack", "limited", "extended", "uncut", "remastered", "subbed",
|
||||||
|
"subs", "multi", "dubbed", "dub", "yts", "yify", "rarbg", "web", "hd",
|
||||||
|
"hq", "cam", "ts", "dvdscr", "r5", "r6",
|
||||||
|
]
|
||||||
|
.into_iter()
|
||||||
|
.collect()
|
||||||
|
});
|
||||||
|
|
||||||
pub fn parse_filename(path: &Path) -> FileHints {
|
pub fn parse_filename(path: &Path) -> FileHints {
|
||||||
let stem = path
|
let stem = path
|
||||||
.file_stem()
|
.file_stem()
|
||||||
@@ -44,9 +64,8 @@ pub fn parse_filename(path: &Path) -> FileHints {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn extract_year(raw: &str) -> Option<i32> {
|
fn extract_year(raw: &str) -> Option<i32> {
|
||||||
let re = Regex::new(r"(19|20)\d{2}").ok()?;
|
|
||||||
let mut year: Option<i32> = None;
|
let mut year: Option<i32> = None;
|
||||||
for mat in re.find_iter(raw) {
|
for mat in YEAR_RE.find_iter(raw) {
|
||||||
if let Ok(parsed) = mat.as_str().parse::<i32>() {
|
if let Ok(parsed) = mat.as_str().parse::<i32>() {
|
||||||
year = Some(parsed);
|
year = Some(parsed);
|
||||||
}
|
}
|
||||||
@@ -55,10 +74,8 @@ fn extract_year(raw: &str) -> Option<i32> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn strip_bracketed(raw: &str) -> String {
|
fn strip_bracketed(raw: &str) -> String {
|
||||||
let re_square = Regex::new(r"\[[^\]]*\]").unwrap();
|
let without_square = BRACKET_SQUARE_RE.replace_all(raw, " ");
|
||||||
let re_round = Regex::new(r"\([^\)]*\)").unwrap();
|
let without_round = BRACKET_ROUND_RE.replace_all(&without_square, " ");
|
||||||
let without_square = re_square.replace_all(raw, " ");
|
|
||||||
let without_round = re_round.replace_all(&without_square, " ");
|
|
||||||
without_round.to_string()
|
without_round.to_string()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -83,7 +100,6 @@ fn clean_title_fragment(fragment: &str, year: Option<i32>) -> String {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn tokenize(raw: &str, year: Option<i32>) -> Vec<String> {
|
fn tokenize(raw: &str, year: Option<i32>) -> Vec<String> {
|
||||||
let stopwords = stopwords();
|
|
||||||
let mut tokens = Vec::new();
|
let mut tokens = Vec::new();
|
||||||
for token in raw.split(|c: char| !c.is_alphanumeric()) {
|
for token in raw.split(|c: char| !c.is_alphanumeric()) {
|
||||||
if token.is_empty() {
|
if token.is_empty() {
|
||||||
@@ -95,7 +111,7 @@ fn tokenize(raw: &str, year: Option<i32>) -> Vec<String> {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if stopwords.contains(lower.as_str()) {
|
if STOPWORDS.contains(lower.as_str()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if token.chars().all(|c| c.is_ascii_uppercase()) && token.len() <= 8 {
|
if token.chars().all(|c| c.is_ascii_uppercase()) && token.len() <= 8 {
|
||||||
@@ -106,19 +122,6 @@ fn tokenize(raw: &str, year: Option<i32>) -> Vec<String> {
|
|||||||
tokens
|
tokens
|
||||||
}
|
}
|
||||||
|
|
||||||
fn stopwords() -> std::collections::HashSet<&'static str> {
|
|
||||||
[
|
|
||||||
"1080p", "720p", "2160p", "480p", "360p", "4k", "uhd", "hdr", "dvdrip",
|
|
||||||
"bdrip", "brrip", "bluray", "blu", "webdl", "web-dl", "webrip", "hdrip",
|
|
||||||
"remux", "x264", "x265", "h264", "h265", "hevc", "aac", "dts", "ac3",
|
|
||||||
"proper", "repack", "limited", "extended", "uncut", "remastered", "subbed",
|
|
||||||
"subs", "multi", "dubbed", "dub", "yts", "yify", "rarbg", "web", "hd",
|
|
||||||
"hq", "cam", "ts", "dvdscr", "r5", "r6",
|
|
||||||
]
|
|
||||||
.into_iter()
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::parse_filename;
|
use super::parse_filename;
|
||||||
|
|||||||
Reference in New Issue
Block a user