From 299acbdb514e94f575362cafcd768e26337faf51 Mon Sep 17 00:00:00 2001 From: KHROTU Date: Wed, 10 Jun 2026 17:27:25 +0800 Subject: [PATCH] feat: Improve link matching Added normalized_url, which: - lowercases the scheme and host (e.g. HTTPS -> https, .COM -> .com) - strips a leading www. - removes default ports (80 for http, 443 for https) - removes the fragment (e.g. /page#thing -> /page) - removes tracking params (e.g. ?utm_source=smth&share_id=xyz&id=abc -> ?id=abc) - strips common index pages (index.html, index.php, default.aspx, ...) - removes the trailing / - canonicalizes percent encoding (uppercases the hex digits) Also did some minor perf work, though it probably won't be very noticeable. --- .gitignore | 1 + Cargo.lock | 1 + fmby_commands/src/fmby.rs | 53 ++-- fmby_core/Cargo.toml | 1 + fmby_core/src/utils/db.rs | 33 ++- fmby_core/src/utils/mod.rs | 1 + fmby_core/src/utils/normalized_url.rs | 256 ++++++++++++++++++ fmby_core/src/utils/url.rs | 11 +- fmby_entities/src/wiki_urls.rs | 1 + fmby_events/src/channels/global.rs | 2 + fmby_migrations/src/lib.rs | 6 +- .../m20220101_000002_add_normalized_url.rs | 53 ++++ 12 files changed, 397 insertions(+), 22 deletions(-) create mode 100644 fmby_core/src/utils/normalized_url.rs create mode 100644 fmby_migrations/src/m20220101_000002_add_normalized_url.rs diff --git a/.gitignore b/.gitignore index 0b745e2..b0286a1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /target +.cargo .env \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index eb80ded..967b5de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -967,6 +967,7 @@ dependencies = [ "serde_json", "tokio", "tracing", + "url", ] [[package]] diff --git a/fmby_commands/src/fmby.rs b/fmby_commands/src/fmby.rs index 8a65c63..8795603 100644 --- a/fmby_commands/src/fmby.rs +++ b/fmby_commands/src/fmby.rs @@ -1,8 +1,9 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use fmby_core::constants::{FMHY_SINGLE_PAGE_ENDPOINT, FmhyChannel}; use fmby_core::utils::db::{ChunkSize, infer_wiki_url_status}; use 
fmby_core::utils::message::get_content_or_referenced; +use fmby_core::utils::normalized_url::normalize_url; use fmby_core::utils::url::{clean_url, extract_urls}; use fmby_core::utils::wiki::collect_wiki_urls; use fmby_entities::sea_orm_active_enums::WikiUrlStatus; @@ -16,7 +17,9 @@ use poise::serenity_prelude::{ use sea_orm::sea_query::OnConflict; use sea_orm::sea_query::extension::postgres::PgExpr; use sea_orm::sqlx::types::chrono::Utc; -use sea_orm::{ActiveValue::*, QueryOrder, QuerySelect, QueryTrait, TransactionTrait, prelude::*}; +use sea_orm::{ + ActiveValue::*, Condition, QueryOrder, QuerySelect, QueryTrait, TransactionTrait, prelude::*, +}; use crate::{Command, Context, Error}; @@ -114,6 +117,7 @@ pub async fn migrate( .iter() .map(|url| clean_url(url).to_owned()) .collect::>(); + let wiki_normalized: HashSet = urls.iter().filter_map(|u| normalize_url(u)).collect(); drop(content); let mut messages_processed = 0u32; let mut messages_skipped = 0u32; @@ -160,10 +164,14 @@ pub async fn migrate( let urls = match status { WikiUrlStatus::Pending => m_urls, WikiUrlStatus::Added => { - let urls_in_wiki = m_urls + let urls_in_wiki: Vec<_> = m_urls .into_iter() - .filter(|url| urls.contains(url)) - .collect::>(); + .filter(|url| { + normalize_url(url) + .as_ref() + .map_or(false, |n| wiki_normalized.contains(n)) + }) + .collect(); if urls_in_wiki.is_empty() { continue; @@ -172,10 +180,14 @@ pub async fn migrate( urls_in_wiki } WikiUrlStatus::Removed => { - let urls_not_in_wiki = m_urls + let urls_not_in_wiki: Vec<_> = m_urls .into_iter() - .filter(|url| !urls.contains(url)) - .collect::>(); + .filter(|url| { + normalize_url(url) + .as_ref() + .map_or(true, |n| !wiki_normalized.contains(n)) + }) + .collect(); if urls_not_in_wiki.is_empty() { continue; @@ -191,6 +203,7 @@ pub async fn migrate( entries .entry(url.clone()) .or_insert_with(|| wiki_urls::ActiveModel { + normalized_url: Set(normalize_url(&url)), url: Set(url), user_id: Set(Some(message.author.id.get() as 
i64)), message_id: Set(Some(message.id.get() as i64)), @@ -254,6 +267,7 @@ pub async fn migrate( entries .entry(url.clone()) .or_insert_with(|| wiki_urls::ActiveModel { + normalized_url: Set(normalize_url(&url)), url: Set(url), guild_id: Set(ctx.guild_id().map(|g| g.get() as i64)), created_at: Set(Utc::now().into()), @@ -363,11 +377,16 @@ pub async fn context( url: String, #[description = "Whether the response should only be visible to you"] ephemeral: Option, ) -> Result<(), Error> { - if let Some(entry) = WikiUrls::find() - .filter(wiki_urls::Column::Url.eq(url)) + let normalized = normalize_url(&url); + let entry = WikiUrls::find() + .filter( + Condition::any() + .add(wiki_urls::Column::Url.eq(&url)) + .add(wiki_urls::Column::NormalizedUrl.eq(normalized.as_deref().unwrap_or(&url))), + ) .one(&ctx.data().database.pool) - .await? - { + .await?; + if let Some(entry) = entry { let context = match (entry.guild_id, entry.channel_id, entry.message_id) { (Some(guild_id), Some(channel_id), Some(message_id)) => { format!( @@ -448,16 +467,18 @@ pub async fn inconsistencies(ctx: Context<'_>) -> Result<(), Error> { .text() .await?; - let urls = collect_wiki_urls(&content) + let urls: HashSet = collect_wiki_urls(&content) .iter() - .map(|url| clean_url(url).to_owned()) - .collect::>(); + .filter_map(|url| normalize_url(url)) + .collect(); let mut added_not_in_wiki = Vec::new(); let mut in_wiki_not_added = Vec::new(); for entry in entries { - let in_wiki = urls.contains(&entry.url); + let in_wiki = normalize_url(&entry.url) + .as_ref() + .map_or(false, |n| urls.contains(n)); match entry.status { WikiUrlStatus::Added => { diff --git a/fmby_core/Cargo.toml b/fmby_core/Cargo.toml index 5ada3fb..812ac79 100644 --- a/fmby_core/Cargo.toml +++ b/fmby_core/Cargo.toml @@ -18,3 +18,4 @@ serde.workspace = true serde_json.workspace = true tokio.workspace = true tracing.workspace = true +url.workspace = true diff --git a/fmby_core/src/utils/db.rs b/fmby_core/src/utils/db.rs index 
3540440..546e447 100644 --- a/fmby_core/src/utils/db.rs +++ b/fmby_core/src/utils/db.rs @@ -1,10 +1,13 @@ +use std::collections::HashSet; + use fmby_entities::sea_orm_active_enums::WikiUrlStatus; use fmby_entities::{prelude::*, wiki_urls}; use poise::serenity_prelude::Message; use sea_orm::sqlx::types::chrono::Utc; -use sea_orm::{ActiveValue::*, IntoActiveModel, Iterable, prelude::*}; +use sea_orm::{ActiveValue::*, Condition, IntoActiveModel, Iterable, prelude::*}; use crate::constants::FmhyChannel; +use crate::utils::normalized_url::normalize_url; pub trait ChunkSize { fn chunk_size() -> usize; @@ -37,6 +40,7 @@ pub fn infer_wiki_url_status(channel_id: u64) -> Option { } } +// We now check against normalized_url as well pub async fn get_wiki_urls_by_urls( urls: &[String], pool: &DatabaseConnection, @@ -45,11 +49,29 @@ pub async fn get_wiki_urls_by_urls( return None; } - WikiUrls::find() - .filter(wiki_urls::Column::Url.is_in(urls)) + let normalized_urls: Vec = urls.iter().filter_map(|u| normalize_url(u)).collect(); + if normalized_urls.is_empty() && urls.is_empty() { + return None; + } + let mut condition = Condition::any(); + if !normalized_urls.is_empty() { + condition = condition.add(wiki_urls::Column::NormalizedUrl.is_in(normalized_urls.clone())); + } + condition = condition.add(wiki_urls::Column::Url.is_in(urls.to_vec())); + let results = WikiUrls::find() + .filter(condition) .all(pool) .await .ok() + .unwrap_or_default(); + let mut seen = HashSet::new(); + let deduped: Vec = + results.into_iter().filter(|e| seen.insert(e.id)).collect(); + if deduped.is_empty() { + None + } else { + Some(deduped) + } } pub async fn update_wiki_urls_with_message( @@ -64,6 +86,11 @@ pub async fn update_wiki_urls_with_message( entry.channel_id = Set(Some(message.channel_id.get() as i64)); entry.updated_at = Set(Utc::now().into()); entry.status = Set(status); + if entry.normalized_url.as_ref().is_none() { + if let Some(norm) = normalize_url(entry.url.as_ref()) { + 
entry.normalized_url = Set(Some(norm)); + } + } let _ = entry.update(pool).await; } diff --git a/fmby_core/src/utils/mod.rs b/fmby_core/src/utils/mod.rs index 04fa250..8d08a16 100644 --- a/fmby_core/src/utils/mod.rs +++ b/fmby_core/src/utils/mod.rs @@ -1,5 +1,6 @@ pub mod db; pub mod formatters; pub mod message; +pub mod normalized_url; pub mod url; pub mod wiki; diff --git a/fmby_core/src/utils/normalized_url.rs b/fmby_core/src/utils/normalized_url.rs new file mode 100644 index 0000000..8aa98b2 --- /dev/null +++ b/fmby_core/src/utils/normalized_url.rs @@ -0,0 +1,256 @@ +use std::collections::HashSet; +use std::sync::LazyLock; + +use url::Url; + +/// yippee gemini ftw because i couldnt find a list online, need to +/// verify/update in the future when we have higher standards +static TRACKING_PARAMS: LazyLock> = LazyLock::new(|| { + HashSet::from([ + // Google Analytics + "utm_source", + "utm_medium", + "utm_campaign", + "utm_term", + "utm_content", + "utm_id", + "utm_reader", + "gclid", + "gclsrc", + "dclid", + "gbraid", + "wbraid", + // Facebook + "fbclid", + "fb_action_ids", + "fb_action_types", + "fb_ref", + "fb_source", + "ref", + "ref_src", + "ref_url", + // Twitter / X + "twclid", + "_twitter_sess_id", + // Reddit + "correlation_id", + "share_id", + // Microsoft / Bing + "msclkid", + "mkt_tok", + // TikTok + "ttclid", + // HubSpot + "hsa_cam", + "hsa_grp", + "hsa_mt", + "hsa_src", + "hsa_ad", + "hsa_acc", + "hsa_net", + "hsa_ver", + "hsa_ol", + "hsa_kw", + "_hsenc", + "_hsmi", + // Mailchimp + "mc_cid", + "mc_eid", + // Other + "sc_campaign", + "sc_channel", + "sc_content", + "sc_medium", + "sc_outcome", + "sc_geo", + "sc_country", + "zanpid", + "igshid", + "igsh", + "epik", + "affiliate_id", + "aff", + "aff_id", + "partner_id", + "campaign_id", + "campaign", + "source", + "sourceid", + "cmpid", + "cid", + // Generic tracking + "trk", + "trkCampaign", + "CMP", + "CMPID", + "spm", + "spm_id", + "from", + "from_source", + "tracking_source", + "itm_source", + 
"itm_campaign", + "itm_term", + "itm_content", + ]) +}); + +pub fn normalize_url(raw: &str) -> Option { + let trimmed = raw.trim(); + let mut url = Url::parse(trimmed).ok()?; + let scheme = url.scheme().to_lowercase(); + url.set_scheme(&scheme).map_err(|_| ()).ok()?; + if let Some(host) = url.host_str().map(|h| h.to_lowercase()) { + let host = host.strip_prefix("www.").unwrap_or(&host); + url.set_host(Some(&host)).map_err(|_| ()).ok()?; + } + let port = url.port(); + if port == Some(80) && url.scheme() == "http" { + url.set_port(None).map_err(|_| ()).ok()?; + } + if port == Some(443) && url.scheme() == "https" { + url.set_port(None).map_err(|_| ()).ok()?; + } + url.set_fragment(None); + { + let mut pairs: Vec<_> = url + .query_pairs() + .filter(|(k, _)| !TRACKING_PARAMS.contains(k.as_ref())) + .collect(); + pairs.sort_by(|a, b| a.0.cmp(&b.0).then_with(|| a.1.cmp(&b.1))); + if pairs.is_empty() { + url.set_query(None); + } else { + let query: String = pairs + .iter() + .map(|(k, v)| format!("{}={}", k, v)) + .collect::>() + .join("&"); + url.set_query(Some(&query)); + } + } + let path = url.path().to_owned(); + let path = path + .trim_end_matches('/') + .trim_end_matches("/index.html") + .trim_end_matches("/index.htm") + .trim_end_matches("/index.php") + .trim_end_matches("/index.asp") + .trim_end_matches("/index.aspx") + .trim_end_matches("/default.asp") + .trim_end_matches("/default.aspx") + .trim_end_matches("/Default.aspx"); + let path = if path.is_empty() { "/" } else { path }; + url.set_path(path); + let result = url.to_string(); + let result = result + .trim_start_matches("https://") + .trim_start_matches("http://"); + Some(percent_encode_uppercase(result)) +} + +fn percent_encode_uppercase(input: &str) -> String { + let mut output = String::with_capacity(input.len()); + let mut chars = input.chars(); + while let Some(c) = chars.next() { + if c == '%' { + let hex1 = chars.next().map(|h| h.to_ascii_uppercase()); + let hex2 = chars.next().map(|h| 
h.to_ascii_uppercase()); + match (hex1, hex2) { + (Some(h1), Some(h2)) if h1.is_ascii_hexdigit() && h2.is_ascii_hexdigit() => { + output.push('%'); + output.push(h1); + output.push(h2); + } + (Some(h1), Some(h2)) => { + output.push('%'); + output.push(h1); + output.push(h2); + } + (Some(h1), None) => { + output.push('%'); + output.push(h1); + } + (None, _) => { + output.push('%'); + } + } + } else { + output.push(c); + } + } + output +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_normalize_basic() { + assert_eq!( + normalize_url("https://www.Example.com/").unwrap(), + "example.com/" + ); + assert_eq!( + normalize_url("http://example.com:80/page").unwrap(), + "example.com/page" + ); + assert_eq!( + normalize_url("https://example.com:443/page").unwrap(), + "example.com/page" + ); + } + #[test] + fn test_normalize_tracking_params() { + let result = + normalize_url("https://example.com/page?utm_source=twitter&ref=abc&good=keep").unwrap(); + assert_eq!(result, "example.com/page?good=keep"); + } + #[test] + fn test_normalize_fragment() { + let result = normalize_url("https://example.com/page#section").unwrap(); + assert_eq!(result, "example.com/page"); + } + #[test] + fn test_normalize_query_sorting() { + let a = normalize_url("https://example.com?b=2&a=1").unwrap(); + let b = normalize_url("https://example.com?a=1&b=2").unwrap(); + assert_eq!(a, b); + } + #[test] + fn test_normalize_index_pages() { + assert_eq!( + normalize_url("https://example.com/index.html").unwrap(), + "example.com/" + ); + assert_eq!( + normalize_url("https://example.com/subdir/index.php").unwrap(), + "example.com/subdir/" + ); + } + #[test] + fn test_normalize_www_strip() { + assert_eq!( + normalize_url("https://www.example.com/page").unwrap(), + "example.com/page" + ); + assert_eq!( + normalize_url("https://example.com/page").unwrap(), + "example.com/page" + ); + } + #[test] + fn test_normalize_percent_encoding() { + let a = 
normalize_url("https://example.com/path%2fpage").unwrap(); + let b = normalize_url("https://example.com/path%2Fpage").unwrap(); + assert_eq!(a, b); + } + #[test] + fn test_all_tracking_params_stripped_combined() { + let result = normalize_url( + "https://www.Example.com:443/page/?utm_source=twitter&fbclid=123&utm_medium=cpc#section", + ) + .unwrap(); + assert_eq!(result, "example.com/page"); + } +} \ No newline at end of file diff --git a/fmby_core/src/utils/url.rs b/fmby_core/src/utils/url.rs index 70177d1..6e38e73 100644 --- a/fmby_core/src/utils/url.rs +++ b/fmby_core/src/utils/url.rs @@ -2,13 +2,20 @@ use std::sync::LazyLock; use regex::Regex; -pub fn clean_url(url: &str) -> &str { +use super::normalized_url::normalize_url; + +pub fn clean_url(url: &str) -> String { + if let Some(normalized) = normalize_url(url) { + return normalized; + } url.trim() .trim_start_matches("https://") .trim_start_matches("http://") .trim_start_matches("www.") .trim_end_matches("?tab=readme-ov-file") .trim_end_matches('/') + .to_lowercase() + .to_owned() } static URL_RE: LazyLock = LazyLock::new(|| { @@ -18,7 +25,7 @@ static URL_RE: LazyLock = LazyLock::new(|| { pub fn extract_urls(haystack: &str) -> Option> { let matches: Vec = URL_RE .find_iter(haystack) - .map(|m| clean_url(m.as_str()).to_owned()) + .map(|m| clean_url(m.as_str())) .filter(|s| !s.starts_with("discord.com/channels") && !s.starts_with("fmhy.net")) .collect(); diff --git a/fmby_entities/src/wiki_urls.rs b/fmby_entities/src/wiki_urls.rs index 2fc9fc8..a247f1f 100644 --- a/fmby_entities/src/wiki_urls.rs +++ b/fmby_entities/src/wiki_urls.rs @@ -10,6 +10,7 @@ pub struct Model { pub id: i32, #[sea_orm(column_type = "Text", unique)] pub url: String, + pub normalized_url: Option, pub channel_id: Option, pub user_id: Option, pub message_id: Option, diff --git a/fmby_events/src/channels/global.rs b/fmby_events/src/channels/global.rs index 82bb6ca..0bb1c1e 100644 --- a/fmby_events/src/channels/global.rs +++ 
b/fmby_events/src/channels/global.rs @@ -7,6 +7,7 @@ use fmby_core::utils::db::{ }; use fmby_core::utils::formatters::UrlFormatter; use fmby_core::utils::message::get_content_or_referenced; +use fmby_core::utils::normalized_url::normalize_url; use fmby_core::utils::url::extract_urls; use fmby_entities::sea_orm_active_enums::WikiUrlStatus; use fmby_entities::{prelude::*, wiki_urls}; @@ -119,6 +120,7 @@ pub async fn on_message(ctx: &Context, message: &Message) { } } else if let Some(status) = status { let _ = WikiUrls::insert_many(urls.into_iter().map(|url| wiki_urls::ActiveModel { + normalized_url: Set(normalize_url(&url)), url: Set(url), user_id: Set(Some(message.author.id.get() as i64)), guild_id: Set(message.guild_id.map(|g| g.get() as i64)), diff --git a/fmby_migrations/src/lib.rs b/fmby_migrations/src/lib.rs index daef9d3..ac41a49 100644 --- a/fmby_migrations/src/lib.rs +++ b/fmby_migrations/src/lib.rs @@ -1,4 +1,5 @@ mod m20220101_000001_create_table; +mod m20220101_000002_add_normalized_url; use async_trait::async_trait; pub use sea_orm_migration::prelude::*; @@ -8,6 +9,9 @@ pub struct Migrator; #[async_trait] impl MigratorTrait for Migrator { fn migrations() -> Vec> { - vec![Box::new(m20220101_000001_create_table::Migration)] + vec![ + Box::new(m20220101_000001_create_table::Migration), + Box::new(m20220101_000002_add_normalized_url::Migration), + ] } } diff --git a/fmby_migrations/src/m20220101_000002_add_normalized_url.rs b/fmby_migrations/src/m20220101_000002_add_normalized_url.rs new file mode 100644 index 0000000..aa2a8cb --- /dev/null +++ b/fmby_migrations/src/m20220101_000002_add_normalized_url.rs @@ -0,0 +1,53 @@ +use async_trait::async_trait; +use sea_orm_migration::{prelude::*, schema::*}; + +use crate::m20220101_000001_create_table::WikiUrls; + +#[derive(DeriveMigrationName)] +pub struct Migration; + +const IDX_WIKI_URLS_NORMALIZED_URL: &str = "idx_wiki_urls_normalized_url"; + +#[async_trait] +impl MigrationTrait for Migration { + async fn 
up(&self, manager: &SchemaManager) -> Result<(), DbErr> { + manager + .alter_table( + Table::alter() + .table(WikiUrls::Table) + .add_column_if_not_exists(text_null(NormalizedUrl::Column)) + .to_owned(), + ) + .await?; + manager + .create_index( + Index::create() + .name(IDX_WIKI_URLS_NORMALIZED_URL) + .table(WikiUrls::Table) + .col(NormalizedUrl::Column) + .to_owned(), + ) + .await?; + Ok(()) + } + async fn down(&self, manager: &SchemaManager) -> Result<(), DbErr> { + manager + .drop_index(Index::drop().name(IDX_WIKI_URLS_NORMALIZED_URL).to_owned()) + .await?; + manager + .alter_table( + Table::alter() + .table(WikiUrls::Table) + .drop_column(NormalizedUrl::Column) + .to_owned(), + ) + .await?; + Ok(()) + } +} + +#[derive(DeriveIden)] +enum NormalizedUrl { + #[sea_orm(iden = "normalized_url")] + Column, +} \ No newline at end of file