manta_server/service/
hardware.rs

1//! Hardware inventory queries for individual nodes and clusters, with concurrent fetching.
2
3use std::sync::Arc;
4use std::time::Instant;
5
6use manta_backend_dispatcher::error::Error;
7use manta_backend_dispatcher::interfaces::hsm::group::GroupTrait;
8use manta_backend_dispatcher::interfaces::hsm::{
9  component::ComponentTrait, hardware_inventory::HardwareInventory,
10};
11use manta_backend_dispatcher::types::NodeSummary;
12use tokio::sync::Semaphore;
13
14use crate::server::common::app_context::InfraContext;
15use crate::service::authorization::{
16  validate_user_group_members_access, validate_user_group_vec_access,
17};
18use crate::service::node_ops::from_hosts_expression_to_xname_vec;
19pub use manta_shared::types::api::hardware::{
20  GetHardwareClusterParams, GetHardwareNodesListParams,
21};
22
/// Maximum number of hardware inventory requests allowed in flight at the
/// same time. `fetch_node_summaries` sizes its semaphore with this value.
const HW_INVENTORY_CONCURRENCY_LIMIT: usize = 15;
25
26// ── Hardware Cluster ──
27
/// Result of a hardware cluster query, as produced by
/// [`get_hardware_cluster`].
pub struct HardwareClusterResult {
  /// Name of the HSM group the inventory was collected for: the
  /// requested group name when the caller supplied one, otherwise the
  /// first group reported as available to the caller.
  pub hsm_group_name: String,
  /// Hardware summaries of the group's member nodes.
  pub node_summaries: Vec<NodeSummary>,
}
37
38/// Fetch hardware inventory for a slice of xnames concurrently,
39/// rate-limited by a semaphore. Shared by cluster and nodes-list queries.
40async fn fetch_node_summaries(
41  infra: &InfraContext<'_>,
42  token: &str,
43  xnames: &[String],
44) -> Vec<NodeSummary> {
45  let mut tasks = tokio::task::JoinSet::new();
46  let sem = Arc::new(Semaphore::new(HW_INVENTORY_CONCURRENCY_LIMIT));
47
48  let n = xnames.len();
49  let width = n.checked_ilog10().unwrap_or(0) as usize + 1;
50
51  for (i, xname) in xnames.iter().enumerate() {
52    tracing::info!(
53      "\rGetting hw components for node '{xname}' [{:>width$}/{n}]",
54      i + 1
55    );
56
57    let backend_cp = infra.backend_clone();
58    let token_str = token.to_string();
59    let xname_str = xname.clone();
60    let permit = Arc::clone(&sem).acquire_owned().await;
61
62    tasks.spawn(async move {
63      let _permit = permit;
64      let hw_inventory_typed = backend_cp
65        .get_inventory_hardware_query(
66          &token_str, &xname_str, None, None, None, None, None,
67        )
68        .await;
69
70      // `NodeSummary::from_csm_value` still parses out of a JSON Value;
71      // re-serialize the typed `HWInventory` and pluck `/Nodes/0` like
72      // before. A future cleanup can replace this round-trip with a
73      // typed constructor that takes `&HWInventory` directly.
74      let node_hw_opt = match hw_inventory_typed {
75        Ok(hw_inv) => serde_json::to_value(&hw_inv)
76          .ok()
77          .and_then(|v| v.pointer("/Nodes/0").cloned()),
78        Err(e) => {
79          tracing::error!(
80            "Failed to get HW inventory for '{}': {}",
81            xname_str,
82            e
83          );
84          None
85        }
86      };
87
88      match node_hw_opt {
89        Some(v) => NodeSummary::from_csm_value(v),
90        None => NodeSummary {
91          xname: xname_str,
92          ..Default::default()
93        },
94      }
95    });
96  }
97
98  let mut summaries = Vec::with_capacity(n);
99  while let Some(res) = tasks.join_next().await {
100    match res {
101      Ok(s) => summaries.push(s),
102      Err(e) => {
103        tracing::error!("Failed fetching node hardware information: {}", e);
104      }
105    }
106  }
107  summaries
108}
109
110/// Fetch hardware inventory for every member of an HSM group.
111///
112/// When `params.group_name` is unset, the first group the caller has
113/// access to is used and surfaced back through
114/// `HardwareClusterResult::hsm_group_name`. Per-node inventory
115/// queries run concurrently, capped by `HW_INVENTORY_CONCURRENCY_LIMIT`.
116/// Empty groups are logged but not treated as an error.
117pub async fn get_hardware_cluster(
118  infra: &InfraContext<'_>,
119  token: &str,
120  params: &GetHardwareClusterParams,
121) -> Result<HardwareClusterResult, Error> {
122  // Get list of target groups the user is asking for
123  let target_group_vec: Vec<String> = if let Some(group) = &params.group_name {
124    vec![group.clone()]
125  } else {
126    infra
127      .backend
128      .get_group_available(token)
129      .await?
130      .iter()
131      .map(|group| group.label.clone())
132      .collect()
133  };
134
135  // Validate groups and get list of groups available
136  validate_user_group_vec_access(infra, token, &target_group_vec).await?;
137
138  let hsm_group_name = target_group_vec
139    .first()
140    .ok_or_else(|| {
141      Error::NotFound("No HSM groups available for this user".to_string())
142    })?
143    .clone();
144
145  let hsm_group = infra.backend.get_group(token, &hsm_group_name).await?;
146
147  let members = hsm_group
148    .members
149    .unwrap_or_default()
150    .ids
151    .unwrap_or_default();
152
153  if members.is_empty() {
154    tracing::warn!("HSM group '{}' has no members", hsm_group.label);
155  }
156
157  tracing::debug!(
158    "Get HW artifacts for nodes in HSM group '{}' and members {:?}",
159    hsm_group.label,
160    members
161  );
162
163  let start_total = Instant::now();
164  let node_summaries = fetch_node_summaries(infra, token, &members).await;
165  tracing::info!(
166    "Time elapsed getting hw inventory for HSM '{}': {:?}",
167    hsm_group_name,
168    start_total.elapsed()
169  );
170
171  Ok(HardwareClusterResult {
172    hsm_group_name,
173    node_summaries,
174  })
175}
176
177// ── Hardware Nodes List ──
178
/// Result of a hardware nodes-list query, as produced by
/// [`get_hardware_nodes_list`].
pub struct HardwareNodesListResult {
  /// Hardware summaries of the nodes resolved from the host expression.
  pub node_summaries: Vec<NodeSummary>,
}
184
185/// Fetch hardware inventory for the nodes named by
186/// `params.host_expression`.
187///
188/// The expression is parsed by [`from_hosts_expression_to_xname_vec`]
189/// (hostlist notation, NIDs, or xnames; siblings are not expanded
190/// here). An empty resolution yields `BadRequest` rather than a
191/// silent no-op. The caller's group access to every resolved xname is
192/// validated through [`validate_user_group_members_access`] before
193/// the per-node inventory fan-out runs.
194pub async fn get_hardware_nodes_list(
195  infra: &InfraContext<'_>,
196  token: &str,
197  params: &GetHardwareNodesListParams,
198) -> Result<HardwareNodesListResult, Error> {
199  let node_metadata_available_vec =
200    infra.backend.get_node_metadata_available(token).await?;
201
202  let node_list = from_hosts_expression_to_xname_vec(
203    &params.host_expression,
204    false,
205    &node_metadata_available_vec,
206  )?;
207
208  if node_list.is_empty() {
209    return Err(Error::BadRequest(
210      "The list of nodes to operate is empty. Nothing to do".to_string(),
211    ));
212  }
213
214  // Validate xnames
215  validate_user_group_members_access(infra, token, &node_list).await?;
216
217  let node_summaries = fetch_node_summaries(infra, token, &node_list).await;
218  Ok(HardwareNodesListResult { node_summaries })
219}
220
221// `calculate_group_hw_component_summary` and `get_cluster_hw_pattern` moved
222// to `manta_shared::types::cluster_status`. Only
223// `calculate_group_hw_component_summary` is still needed locally — the
224// tests below use it.
225#[cfg(test)]
226use manta_shared::types::cluster_status::calculate_group_hw_component_summary;
227
#[cfg(test)]
mod tests {
  use super::*;
  use manta_backend_dispatcher::types::{
    ArtifactSummary, ArtifactType, NodeSummary,
  };

  /// Build an `ArtifactSummary` fixture of the given type with the given
  /// optional info string.
  fn make_artifact(
    art_type: ArtifactType,
    info: Option<&str>,
  ) -> ArtifactSummary {
    ArtifactSummary {
      xname: String::from("x0"),
      r#type: art_type,
      info: info.map(str::to_owned),
    }
  }

  /// Processor fixture with the given model string.
  fn cpu(model: &str) -> ArtifactSummary {
    make_artifact(ArtifactType::Processor, Some(model))
  }

  /// Memory fixture with the given size string.
  fn dimm(size: &str) -> ArtifactSummary {
    make_artifact(ArtifactType::Memory, Some(size))
  }

  #[test]
  fn summary_counts_processors_and_accels() {
    let accel = make_artifact(ArtifactType::NodeAccel, Some("NVIDIA A100"));
    let nodes = vec![NodeSummary {
      xname: String::from("x1000c0s0b0n0"),
      processors: vec![cpu("AMD EPYC 7742"), cpu("AMD EPYC 7742")],
      node_accels: vec![accel],
      memory: Vec::new(),
      node_hsn_nics: Vec::new(),
      ..Default::default()
    }];

    let summary = calculate_group_hw_component_summary(&nodes);

    assert_eq!(summary.get("AMD EPYC 7742").copied(), Some(2));
    assert_eq!(summary.get("NVIDIA A100").copied(), Some(1));
  }

  #[test]
  fn summary_converts_memory_mib_to_gib() {
    // Two 16384 MiB modules are expected to add up to 32 GiB.
    let nodes = vec![NodeSummary {
      xname: String::from("x1000c0s0b0n0"),
      processors: Vec::new(),
      node_accels: Vec::new(),
      memory: vec![dimm("16384 MiB"), dimm("16384 MiB")],
      node_hsn_nics: Vec::new(),
      ..Default::default()
    }];

    let summary = calculate_group_hw_component_summary(&nodes);

    assert_eq!(summary.get("Memory (GiB)").copied(), Some(32));
  }

  #[test]
  fn summary_aggregates_across_multiple_nodes() {
    let first = NodeSummary {
      xname: String::from("n1"),
      processors: vec![cpu("AMD EPYC 7742")],
      ..Default::default()
    };
    let second = NodeSummary {
      xname: String::from("n2"),
      processors: vec![cpu("AMD EPYC 7742"), cpu("Intel Xeon Gold")],
      ..Default::default()
    };
    let nodes = vec![first, second];

    let summary = calculate_group_hw_component_summary(&nodes);

    assert_eq!(summary.get("AMD EPYC 7742").copied(), Some(2));
    assert_eq!(summary.get("Intel Xeon Gold").copied(), Some(1));
  }

  #[test]
  fn summary_empty_nodes() {
    let nodes: Vec<NodeSummary> = Vec::new();

    let summary = calculate_group_hw_component_summary(&nodes);

    assert!(summary.is_empty());
  }

  #[test]
  fn summary_skips_none_info_in_processors() {
    // One processor without an info string, one with.
    let anonymous = make_artifact(ArtifactType::Processor, None);
    let nodes = vec![NodeSummary {
      xname: String::from("n1"),
      processors: vec![anonymous, cpu("AMD EPYC 7742")],
      ..Default::default()
    }];

    let summary = calculate_group_hw_component_summary(&nodes);

    assert_eq!(summary.get("AMD EPYC 7742").copied(), Some(1));
    assert_eq!(summary.len(), 1);
  }
}