Newer
Older
use std::{fmt::Write, sync::Arc};
use axum::{
extract::{Query, State},
response::{Html, IntoResponse},
routing::get,
Router,
};
use commoncrawl_graph::{lookup::DomainView, measure_async};
use datadog_statsd::Client;
use reqwest::StatusCode;
use serde::Deserialize;
use sqlx::{postgres::PgPoolOptions, Pool, Postgres};
/// Shared, read-only state made available to the request handlers.
struct AppStateInner {
/// Postgres connection pool; `root_inner` passes it to `DomainView::lookup`.
pool: Pool<Postgres>,
/// StatsD client; `root` passes it to `measure_async` for the `root.latency` metric.
statsd: Arc<Client>,
}
/// Reference-counted handle to [`AppStateInner`]; this is the type registered with
/// `Router::with_state` and extracted in handlers via `State<AppState>`.
type AppState = Arc<AppStateInner>;
/// Query-string parameters accepted by the `/` route.
#[derive(Deserialize)]
struct RootQuery {
/// Domain (or wildcard pattern) to look up. `#[serde(default)]` makes it the empty
/// string when the parameter is absent, which `root_inner` treats as "no search yet".
#[serde(default)]
k: String,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let pool = PgPoolOptions::new()
.max_connections(48)
.connect("postgres://localhost/commoncrawl_graph")
.await?;
let statsd = Arc::new(Client::new(
"127.0.0.1:8125",
"commoncrawl_graph_import",
None,
)?);
let app = Router::new()
.route("/", get(root))
.with_state(Arc::new(AppStateInner { pool, statsd }));
let listener = tokio::net::TcpListener::bind("0.0.0.0:3120").await.unwrap();
axum::serve(listener, app).await.unwrap();
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
async fn root(State(state): State<AppState>, Query(query): Query<RootQuery>) -> impl IntoResponse {
match measure_async(&state.statsd, "root.latency", root_inner(&state, query.k)).await {
Ok(x) => (StatusCode::OK, x),
Err(e) => (
StatusCode::INTERNAL_SERVER_ERROR,
Html::from(format!("Error: {e}")),
),
}
}
async fn root_inner(state: &AppState, k: String) -> anyhow::Result<Html<String>> {
let results = if k.is_empty() {
"".to_string()
} else {
let d = DomainView::lookup(&state.pool, &k).await?;
let inner_k = html_escape::encode_text(&d.query);
let res = if d.is_empty() {
"No results ):".to_string()
} else {
let from_table: String = d
.links_from
.into_iter()
.fold(String::new(), |mut cur, (from, to)| {
write!(&mut cur, r#"<tr><td><a href="/?k={from}">{from}</a></td><td><a href="/?k={to}">{to}</a></td></tr>"#).unwrap();
cur
})
;
let to_table: String = d
.links_to
.into_iter()
.fold(String::new(), |mut cur, (from, to)| {
write!(&mut cur, r#"<tr><td><a href="/?k={from}">{from}</a></td><td><a href="/?k={to}">{to}</a></td></tr>"#).unwrap();
cur
});
format!(
r#"<div class="table-container"><div><table><tr><th>Source</th><th>Destination</th></tr>{to_table}</table></div><div><table><tr><th>Source</th><th>Destination</th></tr>{from_table}</table></div></div>"#
)
};
format!("<h2>Results for {inner_k}:</h2>{res}")
};
let inner_k = html_escape::encode_double_quoted_attribute(&k);
let preamble = r#"<h1>Who's Linking to Me?</h1><p>This site uses <a href="https://commoncrawl.org/">Common Crawl</a> data to find all hosts that link to a site (and all sites linked to by that site). Wildcards are supported at the beginning of domain names, e.g. '*.scd31.com'. Only 1 000 maximum wildcard matches are shown, and a maximum of 10 000 edges (5 000 in either direction)."#;
let html = format!(
r#"<!DOCTYPE html><head><style>.table-container{{display: flex;justify-content: space-around;}}tr:nth-of-type(odd){{ background-color:#ccc;}}td{{padding:0px 8px;}}</style></head><body>{preamble}<form><label for="k">Domain: </label><input name="k" id="k" value="{inner_k}" /><input type="submit" value="Find" /></form><br>{results}</body></html>"#
);
Ok(Html::from(html.to_string()))
}