use std::{fmt::Write, sync::Arc}; use axum::{ extract::{Query, State}, response::{Html, IntoResponse}, routing::get, Router, }; use commoncrawl_graph::{lookup::DomainView, measure_async}; use datadog_statsd::Client; use reqwest::StatusCode; use serde::Deserialize; use sqlx::{postgres::PgPoolOptions, Pool, Postgres}; struct AppStateInner { pool: Pool<Postgres>, statsd: Arc<Client>, } type AppState = Arc<AppStateInner>; #[derive(Deserialize)] struct RootQuery { #[serde(default)] k: String, } #[tokio::main] async fn main() -> anyhow::Result<()> { let pool = PgPoolOptions::new() .max_connections(48) .connect("postgres://localhost/commoncrawl_graph") .await?; let statsd = Arc::new(Client::new( "127.0.0.1:8125", "commoncrawl_graph_import", None, )?); let app = Router::new() .route("/", get(root)) .with_state(Arc::new(AppStateInner { pool, statsd })); let listener = tokio::net::TcpListener::bind("0.0.0.0:3120").await.unwrap(); axum::serve(listener, app).await.unwrap(); Ok(()) } async fn root(State(state): State<AppState>, Query(query): Query<RootQuery>) -> impl IntoResponse { match measure_async(&state.statsd, "root.latency", root_inner(&state, query.k)).await { Ok(x) => (StatusCode::OK, x), Err(e) => ( StatusCode::INTERNAL_SERVER_ERROR, Html::from(format!("Error: {e}")), ), } } async fn root_inner(state: &AppState, k: String) -> anyhow::Result<Html<String>> { let results = if k.is_empty() { "".to_string() } else { let d = DomainView::lookup(&state.pool, &k).await?; let inner_k = html_escape::encode_text(&d.query); let res = if d.is_empty() { "No results ):".to_string() } else { let from_table: String = d .links_from .into_iter() .fold(String::new(), |mut cur, (from, to)| { write!(&mut cur, r#"<tr><td><a href="/?k={from}">{from}</a></td><td><a href="/?k={to}">{to}</a></td></tr>"#).unwrap(); cur }) ; let to_table: String = d .links_to .into_iter() .fold(String::new(), |mut cur, (from, to)| { write!(&mut cur, r#"<tr><td><a href="/?k={from}">{from}</a></td><td><a href="/?k={to}">{to}</a></td></tr>"#).unwrap(); cur }); format!( r#"<div class="table-container"><div><table><tr><th>Source</th><th>Destination</th></tr>{to_table}</table></div><div><table><tr><th>Source</th><th>Destination</th></tr>{from_table}</table></div></div>"# ) }; format!("<h2>Results for {inner_k}:</h2>{res}") }; let inner_k = html_escape::encode_double_quoted_attribute(&k); let preamble = r#"<h1>Who's Linking to Me?</h1><p>This site uses <a href="https://commoncrawl.org/">Common Crawl</a> data to find all hosts that link to a site (and all sites linked to by that site). Wildcards are supported at the beginning of domain names, e.g. '*.scd31.com'. Only 1 000 maximum wildcard matches are shown, and a maximum of 10 000 edges (5 000 in either direction)."#; let html = format!( r#"<!DOCTYPE html><head><style>.table-container{{display: flex;justify-content: space-around;}}tr:nth-of-type(odd){{ background-color:#ccc;}}td{{padding:0px 8px;}}</style></head><body>{preamble}<form><label for="k">Domain: </label><input name="k" id="k" value="{inner_k}" /><input type="submit" value="Find" /></form><br>{results}</body></html>"# ); Ok(Html::from(html.to_string())) }