Improve filename parsing for spammy suffixes

This commit is contained in:
2025-12-30 18:22:54 -05:00
parent 3554adf7e4
commit 8bf98e057c

View File

@@ -40,8 +40,9 @@ pub fn parse_filename(path: &Path) -> FileHints {
let year = extract_year(&stem);
let cleaned = strip_bracketed(&stem);
let cleaned_for_tokens = strip_dash_suffix(&cleaned);
let alt_titles = extract_alt_titles(&cleaned, year);
let tokens = tokenize(&cleaned, year);
let tokens = strip_noise_tokens(tokenize(&cleaned_for_tokens, year));
let title = if tokens.is_empty() {
let mut fallback = cleaned.clone();
@@ -79,6 +80,32 @@ fn strip_bracketed(raw: &str) -> String {
without_round.to_string()
}
/// Cuts a spammy trailing segment (as judged by `should_strip_dash_suffix`) off `raw`.
///
/// Each `" - "` separator is examined from left to right, and the text is cut at the
/// first one whose remainder `should_strip_dash_suffix` accepts. Examining only the
/// first separator would leave the suffix in place whenever the name itself contains
/// a dash, because the remainder would then include the rest of the name.
///
/// Returns the trimmed text before the chosen separator, or `raw` unchanged when no
/// separator has a remainder that qualifies.
fn strip_dash_suffix(raw: &str) -> String {
    const SEPARATOR: &str = " - ";

    raw.match_indices(SEPARATOR)
        // `match_indices` yields byte offsets on match boundaries, so the slices are valid.
        .find(|&(at, sep)| should_strip_dash_suffix(&raw[at + sep.len()..]))
        .map_or_else(|| raw.to_string(), |(at, _)| raw[..at].trim().to_string())
}
/// Decides whether `right` should be discarded as a suffix.
///
/// After trimming surrounding whitespace, returns `true` when the text either
/// contains `http://`, `https://` or `www.` (ASCII case-insensitively), or contains
/// a dot and no space character. Blank input always yields `false`.
fn should_strip_dash_suffix(right: &str) -> bool {
    const URL_MARKERS: [&str; 3] = ["http://", "https://", "www."];

    let candidate = right.trim();
    if candidate.is_empty() {
        return false;
    }

    let folded = candidate.to_ascii_lowercase();
    let has_url_marker = URL_MARKERS.iter().any(|marker| folded.contains(*marker));
    // A dotted run with no spaces, e.g. "website.tld".
    let is_single_dotted_word = candidate.contains('.') && !candidate.contains(' ');

    has_url_marker || is_single_dotted_word
}
fn extract_alt_titles(raw: &str, year: Option<i32>) -> Vec<String> {
let mut alt_titles = Vec::new();
if let Some((left, right)) = raw.split_once(" - ") {
@@ -122,6 +149,37 @@ fn tokenize(raw: &str, year: Option<i32>) -> Vec<String> {
tokens
}
/// Removes promotional words from the start and end of `tokens`.
///
/// Applied once each, in this order:
/// 1. a leading `watch` or `download`;
/// 2. a trailing `for free` pair;
/// 3. a trailing `online`, `free` or `download`.
///
/// All comparisons ignore ASCII case. The remaining tokens are returned in their
/// original order; the result may be empty.
fn strip_noise_tokens(mut tokens: Vec<String>) -> Vec<String> {
    const LEADING_NOISE: [&str; 2] = ["watch", "download"];
    const TRAILING_NOISE: [&str; 3] = ["online", "free", "download"];

    fn is_noise(token: &str, words: &[&str]) -> bool {
        words.iter().any(|word| token.eq_ignore_ascii_case(word))
    }

    if matches!(tokens.first(), Some(head) if is_noise(head, &LEADING_NOISE)) {
        tokens.remove(0);
    }

    let ends_with_for_free = matches!(
        tokens.as_slice(),
        [.., prev, last]
            if prev.eq_ignore_ascii_case("for") && last.eq_ignore_ascii_case("free")
    );
    if ends_with_for_free {
        tokens.truncate(tokens.len() - 2);
    }

    // Runs after the pair check so "... online for free" loses all three words.
    if matches!(tokens.last(), Some(tail) if is_noise(tail, &TRAILING_NOISE)) {
        tokens.pop();
    }

    tokens
}
#[cfg(test)]
mod tests {
use super::parse_filename;
@@ -177,4 +235,12 @@ mod tests {
);
assert_eq!(hints.year, Some(2022));
}
/// The leading "Watch", the trailing "for free" and the " - website.tld" suffix must
/// not leak into the parsed title, and the year must still be detected.
#[test]
fn strips_watch_for_free_domain_suffix() {
    let hints = parse_filename(Path::new(
        "Watch VideoName 2025 HD for free - website.tld.mp4",
    ));

    assert_eq!(hints.year, Some(2025));
    assert_eq!(hints.title.as_deref(), Some("VideoName"));
}
}