manta_server/server/
mod.rs

1//! Axum HTTP/HTTPS server setup.
2//!
3//! - [`ServerState`] — shared application state passed through every
4//!   handler via Axum's `State<Arc<ServerState>>` extractor. Holds one
5//!   [`SiteBackend`] per configured site so a single server can fan
6//!   out to multiple CSM/OpenCHAMI clusters.
7//! - [`start_server`] — binary entry point. Builds the router (see
8//!   [`routes::build_router`]), installs the request-logging
9//!   middleware, optionally wraps the listener in TLS, and installs a
10//!   SIGTERM/Ctrl+C handler for graceful shutdown.
11//! - Submodules:
12//!   - [`handlers`] — per-resource Axum handlers; converts HTTP
13//!     requests into service-layer calls.
14//!   - [`routes`] — router registration (one entry per `/api/v1`
15//!     path).
16//!   - [`auth_middleware`] — defensive middleware applied to
17//!     `/api/v1/auth/*` (per-IP rate limit + body redaction).
18//!   - [`common`] — server-only helpers (per-request `InfraContext`,
19//!     Kafka audit producer, JWT claim extractors, Vault client).
20//!   - [`api_doc`] — utoipa OpenAPI document served at
21//!     `GET /openapi.json` + `GET /docs`.
22
23pub mod api_doc;
24pub mod auth_middleware;
25pub mod common;
26pub mod handlers;
27pub mod routes;
28
29use std::collections::HashMap;
30use std::net::SocketAddr;
31use std::sync::Arc;
32
33use axum_server::tls_rustls::RustlsConfig;
34use manta_backend_dispatcher::error::Error;
35use std::time::Duration;
36
37use crate::dispatcher::StaticBackendDispatcher;
38use crate::server::common::app_context::InfraContext;
39use crate::server::common::kafka::Kafka;
40
/// All per-site connection data the server needs to talk to backend APIs.
///
/// Built once at startup from a `[sites.X]` block in `server.toml`,
/// then owned by [`ServerState::sites`] inside a `HashMap` keyed by
/// the site name. The matching `[sites.X]` block is selected per
/// request from the `X-Manta-Site` header.
///
/// Borrowed per request as an [`common::app_context::InfraContext`]
/// via [`ServerState::infra_context`] so the service layer can pass
/// the per-site bundle around without taking ownership. That method
/// lends every field out by reference; the `Option<String>` fields
/// are handed over through `Option::as_deref`.
pub struct SiteBackend {
  /// Dispatches API calls to the configured CSM or OpenCHAMI backend.
  pub backend: StaticBackendDispatcher,
  /// Base URL for the CSM/OpenCHAMI API (e.g. `https://api.cluster/apis`).
  pub shasta_base_url: String,
  /// PEM-encoded root CA certificate for the backend; empty vec skips verification.
  pub shasta_root_cert: Vec<u8>,
  /// SOCKS5 proxy URL; `None` means direct connections.
  pub socks5_proxy: Option<String>,
  /// HashiCorp Vault base URL; `None` means features requiring vault return 501.
  pub vault_base_url: Option<String>,
  /// Gitea VCS base URL derived from the site base URL.
  pub gitea_base_url: String,
  /// Kubernetes API URL; `None` means console and log-streaming endpoints return 501.
  pub k8s_api_url: Option<String>,
}
67
/// Shared state for all HTTP handlers.
///
/// Holds one [`SiteBackend`] per configured site so a single server
/// can serve multiple clusters. Each request supplies the target site
/// via the `X-Manta-Site` header; handlers call
/// [`ServerState::infra_context`] (or, via the
/// [`handlers::RequestCtx`] extractor, the cached
/// `RequestCtx::infra()` shortcut) to retrieve the per-site data.
///
/// Plumbed through Axum's `State<Arc<ServerState>>` extractor. Handed
/// to [`start_server`] as an `Arc`, which moves it into the router
/// built by [`routes::build_router`]; clones are cheap because only
/// the `Arc` is cloned.
pub struct ServerState {
  /// Per-site connection data, keyed by site name. Looked up by
  /// [`ServerState::infra_context`]; an unknown name yields
  /// `Error::NotFound`.
  pub sites: HashMap<String, SiteBackend>,
  /// How long a WebSocket console session may be idle before the server
  /// closes it.  Protects against leaked Kubernetes pod attachments.
  pub console_inactivity_timeout: Duration,
  /// Kafka producer for security/audit events (currently used only by
  /// `/api/v1/auth/*`). `None` disables audit emission.
  pub auditor: Option<Kafka>,
  /// Per-source-IP rate limit on `/api/v1/auth/*` (requests/minute).
  /// `None` disables in-process rate limiting.
  pub auth_rate_limit_per_minute: Option<u32>,
  /// Global request timeout applied to every HTTP route (router-level
  /// `TimeoutLayer`). All long-running work (power transitions, SAT
  /// dispatch) runs CLI-side, so this is the only request-timeout
  /// knob the server has.
  pub request_timeout: Duration,
  /// Drain window for `axum_server::Handle::graceful_shutdown` on
  /// SIGTERM / Ctrl+C. Sourced from
  /// `server.toml`'s `[server] shutdown_grace_period_secs`.
  /// [`start_server`] copies this value out before the state is moved
  /// into the router.
  pub shutdown_grace_period: Duration,
  /// Filesystem root that confines `POST /migrate/{backup,restore}`
  /// file access. `None` disables both endpoints — even admin callers
  /// must wait for an operator to opt in via `[server]
  /// migrate_backup_root`. The path is stored already-canonicalised
  /// so per-request validation is a single `starts_with` against this.
  pub migrate_backup_root: Option<std::path::PathBuf>,
}
108
109impl ServerState {
110  /// Build a borrowed [`InfraContext`] for the named site.
111  ///
112  /// Called per-request so the service layer can work with its
113  /// existing `&InfraContext<'_>` API without taking ownership of the
114  /// underlying [`SiteBackend`].
115  ///
116  /// # Errors
117  ///
118  /// Returns [`Error::NotFound`] when `site_name` is not in
119  /// [`Self::sites`].
120  pub fn infra_context<'a>(
121    &'a self,
122    site_name: &'a str,
123  ) -> Result<InfraContext<'a>, Error> {
124    let site = self.sites.get(site_name).ok_or_else(|| {
125      Error::NotFound(format!("site '{site_name}' not found"))
126    })?;
127    Ok(InfraContext {
128      backend: &site.backend,
129      site_name,
130      shasta_base_url: &site.shasta_base_url,
131      shasta_root_cert: &site.shasta_root_cert,
132      socks5_proxy: site.socks5_proxy.as_deref(),
133      vault_base_url: site.vault_base_url.as_deref(),
134      gitea_base_url: &site.gitea_base_url,
135      k8s_api_url: site.k8s_api_url.as_deref(),
136    })
137  }
138}
139
140/// Request-logging middleware. Logs `method uri → status` at INFO
141/// after the inner handler returns, including handler-internal
142/// error responses. Composed once by [`start_server`] around the
143/// router built by [`routes::build_router`].
144async fn log_requests(
145  request: axum::extract::Request,
146  next: axum::middleware::Next,
147) -> axum::response::Response {
148  let method = request.method().clone();
149  let uri = request.uri().clone();
150  let response = next.run(request).await;
151  tracing::info!("{} {} → {}", method, uri, response.status());
152  response
153}
154
155/// Start the HTTP or HTTPS server.
156///
157/// Builds the router via [`routes::build_router`], wraps it with the
158/// request-logging middleware, binds the listener at
159/// `<listen_addr>:<port>`, and serves until a SIGTERM or Ctrl+C is
160/// received — at which point the in-process shutdown handler
161/// triggers `axum_server`'s graceful drain with the
162/// [`ServerState::shutdown_grace_period`] window.
163///
164/// When `cert_path` and `key_path` are both `Some`, the server
165/// listens with TLS (`https://`). When both are `None`, it listens
166/// as plain HTTP. Mixing one of the two is rejected.
167///
168/// # Errors
169///
170/// - [`Error::BadRequest`] when `listen_addr:port` does not parse as
171///   a `SocketAddr`, or when exactly one of `cert_path` / `key_path`
172///   is supplied (they must be set together).
173/// - Any I/O / TLS load error from `RustlsConfig::from_pem_file` or
174///   the underlying `axum_server::bind*` call surfaces via the
175///   `From<io::Error>` impl on [`Error`].
176pub async fn start_server(
177  state: Arc<ServerState>,
178  listen_addr: &str,
179  port: u16,
180  cert_path: Option<&str>,
181  key_path: Option<&str>,
182) -> Result<(), Error> {
183  // Read shutdown-grace before `state` is moved into the router.
184  let shutdown_grace_period = state.shutdown_grace_period;
185
186  // Both `request_timeout` and `power_timeout` are now applied **inside**
187  // `build_router` so the per-route `/power` override actually wins —
188  // see the comment on `build_router` for why a global outer layer
189  // would silently defeat the override.
190  let app =
191    routes::build_router(state).layer(axum::middleware::from_fn(log_requests));
192
193  let addr: SocketAddr = format!("{listen_addr}:{port}")
194    .parse()
195    .map_err(|e| Error::BadRequest(format!("Invalid listen address: {e}")))?;
196
197  match (cert_path, key_path) {
198    (Some(cert), Some(key)) => {
199      let tls_config = RustlsConfig::from_pem_file(cert, key).await?;
200      let handle = axum_server::Handle::new();
201      let ready_handle = handle.clone();
202      tokio::spawn(async move {
203        ready_handle.listening().await;
204        tracing::info!(
205          "HTTPS server ready, accepting requests on https://{}",
206          addr
207        );
208        eprintln!("HTTPS server ready, accepting requests on https://{addr}");
209      });
210      install_shutdown_handler(handle.clone(), shutdown_grace_period);
211      axum_server::bind_rustls(addr, tls_config)
212        .handle(handle)
213        .serve(app.into_make_service_with_connect_info::<SocketAddr>())
214        .await?;
215    }
216    (None, None) => {
217      let handle = axum_server::Handle::new();
218      let ready_handle = handle.clone();
219      tokio::spawn(async move {
220        ready_handle.listening().await;
221        tracing::info!(
222          "HTTP server ready, accepting requests on http://{}",
223          addr
224        );
225        eprintln!("HTTP server ready, accepting requests on http://{addr}");
226      });
227      install_shutdown_handler(handle.clone(), shutdown_grace_period);
228      axum_server::bind(addr)
229        .handle(handle)
230        .serve(app.into_make_service_with_connect_info::<SocketAddr>())
231        .await?;
232    }
233    _ => {
234      return Err(Error::BadRequest(
235        "--cert and --key must be provided together".to_string(),
236      ));
237    }
238  }
239
240  Ok(())
241}
242
243/// Spawn a task that waits for SIGTERM or Ctrl+C and triggers
244/// `axum_server`'s graceful shutdown with a bounded drain window.
245/// Without this, the runtime drops in-flight requests when Tokio is
246/// shut down by the OS — `docker stop` / k8s pod termination would
247/// abandon clients mid-call.
248///
249/// The grace-period comes from `ServerState::shutdown_grace_period`
250/// (sourced from `server.toml`); pods that hit this without
251/// finishing get SIGKILL'd by the kubelet.
252fn install_shutdown_handler(
253  handle: axum_server::Handle<SocketAddr>,
254  grace_period: Duration,
255) {
256  tokio::spawn(async move {
257    let mut sigterm = match tokio::signal::unix::signal(
258      tokio::signal::unix::SignalKind::terminate(),
259    ) {
260      Ok(s) => s,
261      Err(e) => {
262        tracing::warn!(
263          "failed to install SIGTERM handler; falling back to Ctrl+C only: {e}"
264        );
265        let _ = tokio::signal::ctrl_c().await;
266        handle.graceful_shutdown(Some(grace_period));
267        return;
268      }
269    };
270    let grace_secs = grace_period.as_secs();
271    tokio::select! {
272      _ = sigterm.recv() => {
273        tracing::info!("SIGTERM received; draining for up to {grace_secs}s");
274      }
275      _ = tokio::signal::ctrl_c() => {
276        tracing::info!("Ctrl+C received; draining for up to {grace_secs}s");
277      }
278    }
279    handle.graceful_shutdown(Some(grace_period));
280  });
281}
282
#[cfg(test)]
mod timeout_layer_tests {
  //! Behavioural tests for a router-level `TimeoutLayer` configured
  //! with `TimeoutLayer::with_status_code(408, ..)`:
  //!
  //! - a handler that outlives the limit is answered with
  //!   `408 Request Timeout`;
  //! - a handler that finishes inside the limit passes through
  //!   untouched (`200 OK`).
  //!
  //! Only a single layer wrapping the whole router is exercised here;
  //! nothing in this module covers stacking a second, per-route
  //! timeout layer.
  //!
  //! Pure tower/axum unit tests — no `ServerState`, no real handlers,
  //! no TCP listener. `tower::ServiceExt::oneshot` drives the router
  //! in-process.
  use std::time::Duration;

  use axum::{
    Router,
    body::Body,
    http::{Request, StatusCode},
    routing::get,
  };
  use tower::ServiceExt as _;
  use tower_http::timeout::TimeoutLayer;

  /// Build a body-less `GET` request for `uri`.
  fn get_req(uri: &str) -> Request<Body> {
    Request::builder()
      .method("GET")
      .uri(uri)
      .body(Body::empty())
      .unwrap()
  }

  /// Handler that sleeps `delay` then returns 200 — used to drive
  /// the timeout layer past its limit on purpose.
  async fn sleep_handler(delay: Duration) -> &'static str {
    tokio::time::sleep(delay).await;
    "ok"
  }

  /// Single-route router whose handler sleeps for `handler_delay`,
  /// wrapped in a `TimeoutLayer` that answers 408 after `limit`.
  fn timed_router(
    path: &str,
    handler_delay: Duration,
    limit: Duration,
  ) -> Router {
    Router::new()
      .route(
        path,
        get(move || async move { sleep_handler(handler_delay).await }),
      )
      .layer(TimeoutLayer::with_status_code(
        StatusCode::REQUEST_TIMEOUT,
        limit,
      ))
  }

  #[tokio::test]
  async fn global_timeout_returns_408_when_handler_exceeds_limit() {
    let router = timed_router(
      "/slow",
      Duration::from_millis(400),
      Duration::from_millis(50),
    );

    let resp = router.oneshot(get_req("/slow")).await.unwrap();
    assert_eq!(resp.status(), StatusCode::REQUEST_TIMEOUT);
  }

  #[tokio::test]
  async fn fast_handler_finishes_before_timeout_fires() {
    let router = timed_router(
      "/fast",
      Duration::from_millis(10),
      Duration::from_secs(5),
    );

    let resp = router.oneshot(get_req("/fast")).await.unwrap();
    assert_eq!(resp.status(), StatusCode::OK);
  }
}
manta_server/server/mod.rs

manta_server/server/
mod.rs