diff --git a/src/parse.rs b/src/parse.rs
index dba8459..dcd15da 100644
--- a/src/parse.rs
+++ b/src/parse.rs
@@ -40,8 +40,9 @@ pub fn parse_filename(path: &Path) -> FileHints {
 
     let year = extract_year(&stem);
     let cleaned = strip_bracketed(&stem);
+    let cleaned_for_tokens = strip_dash_suffix(&cleaned);
     let alt_titles = extract_alt_titles(&cleaned, year);
-    let tokens = tokenize(&cleaned, year);
+    let tokens = strip_noise_tokens(tokenize(&cleaned_for_tokens, year));
 
     let title = if tokens.is_empty() {
         let mut fallback = cleaned.clone();
@@ -79,6 +80,44 @@ fn strip_bracketed(raw: &str) -> String {
     without_round.to_string()
 }
 
+/// Removes a trailing website segment (for example " - website.tld") from `raw`.
+///
+/// Each " - " separator is tried from left to right, and `raw` is cut at the first one
+/// whose right-hand side [`should_strip_dash_suffix`] classifies as website noise. Trying
+/// every separator matters for titles that contain " - " themselves: in
+/// "Title - Part 2 - website.tld" only the site segment is dropped. When no separator
+/// qualifies, `raw` is returned unchanged.
+fn strip_dash_suffix(raw: &str) -> String {
+    for (start, separator) in raw.match_indices(" - ") {
+        if should_strip_dash_suffix(&raw[start + separator.len()..]) {
+            return raw[..start].trim().to_string();
+        }
+    }
+    raw.to_string()
+}
+
+/// Reports whether `right`, the text following a " - " separator, looks like a website
+/// reference rather than part of the title.
+///
+/// After trimming, the text qualifies when it contains "http://", "https://" or "www."
+/// (ASCII case-insensitive), or when it contains a dot but no space, as a bare domain
+/// such as "website.tld" does. Empty or whitespace-only text never qualifies.
+fn should_strip_dash_suffix(right: &str) -> bool {
+    const URL_MARKERS: [&str; 3] = ["http://", "https://", "www."];
+
+    let trimmed = right.trim();
+    if trimmed.is_empty() {
+        return false;
+    }
+
+    let lower = trimmed.to_ascii_lowercase();
+    let has_url_marker = URL_MARKERS.iter().any(|&marker| lower.contains(marker));
+    // A dotted word without spaces ("website.tld") is taken to be a bare domain name.
+    let is_dotted_word = trimmed.contains('.') && !trimmed.contains(' ');
+
+    has_url_marker || is_dotted_word
+}
+
 fn extract_alt_titles(raw: &str, year: Option) -> Vec {
     let mut alt_titles = Vec::new();
     if let Some((left, right)) = raw.split_once(" - ") {
@@ -122,6 +161,52 @@ fn tokenize(raw: &str, year: Option) -> Vec {
     tokens
 }
 
+/// Removes streaming-site boilerplate words from the two ends of `tokens`.
+///
+/// Three steps run in order, each comparing ASCII case-insensitively:
+/// 1. a leading "watch" or "download" is dropped;
+/// 2. a trailing "for" + "free" pair is dropped;
+/// 3. one trailing "online", "free" or "download" is dropped.
+///
+/// Tokens in the middle are never touched, and the result may be empty. The function is
+/// generic so that owned and borrowed string tokens are both accepted.
+fn strip_noise_tokens<T: AsRef<str>>(mut tokens: Vec<T>) -> Vec<T> {
+    const LEADING_NOISE: [&str; 2] = ["watch", "download"];
+    const TRAILING_NOISE: [&str; 3] = ["online", "free", "download"];
+
+    fn is_one_of(token: &str, words: &[&str]) -> bool {
+        words.iter().any(|word| token.eq_ignore_ascii_case(word))
+    }
+
+    let has_leading_noise = tokens
+        .first()
+        .map_or(false, |first| is_one_of(first.as_ref(), &LEADING_NOISE));
+    if has_leading_noise {
+        tokens.remove(0);
+    }
+
+    let ends_with_for_free = match tokens.as_slice() {
+        [.., prev, last] => {
+            prev.as_ref().eq_ignore_ascii_case("for")
+                && last.as_ref().eq_ignore_ascii_case("free")
+        }
+        _ => false,
+    };
+    if ends_with_for_free {
+        tokens.truncate(tokens.len() - 2);
+    }
+
+    // Runs after the "for free" step, so "... online for free" loses all three words.
+    let has_trailing_noise = tokens
+        .last()
+        .map_or(false, |last| is_one_of(last.as_ref(), &TRAILING_NOISE));
+    if has_trailing_noise {
+        tokens.pop();
+    }
+
+    tokens
+}
+
 #[cfg(test)]
 mod tests {
     use super::parse_filename;
@@ -177,4 +262,21 @@ mod tests {
         );
         assert_eq!(hints.year, Some(2022));
     }
+
+    #[test]
+    fn strips_watch_for_free_domain_suffix() {
+        let path = Path::new("Watch VideoName 2025 HD for free - website.tld.mp4");
+        let hints = parse_filename(path);
+        assert_eq!(hints.title.as_deref(), Some("VideoName"));
+        assert_eq!(hints.year, Some(2025));
+    }
+
+    #[test]
+    fn strips_domain_after_later_dash() {
+        assert_eq!(
+            super::strip_dash_suffix("Title - Part 2 - website.tld"),
+            "Title - Part 2"
+        );
+        assert_eq!(super::strip_dash_suffix("Title - Part 2"), "Title - Part 2");
+    }
 }