manta_server/service/
hardware.rs

1//! Hardware inventory queries for individual nodes and clusters, with concurrent fetching.
2
3use std::sync::Arc;
4use std::time::Instant;
5
6use manta_backend_dispatcher::error::Error;
7use manta_backend_dispatcher::interfaces::hsm::group::GroupTrait;
8use manta_backend_dispatcher::interfaces::hsm::hardware_inventory::HardwareInventory;
9use manta_backend_dispatcher::types::NodeSummary;
10use tokio::sync::Semaphore;
11
12use crate::server::common::app_context::InfraContext;
13use crate::server::common::authorization::{
14  get_groups_names_available, validate_target_hsm_members,
15};
16use crate::server::common::node_ops;
17pub use manta_shared::shared::params::hardware::{
18  GetHardwareClusterParams, GetHardwareNodesListParams,
19};
20
/// Upper bound on how many hardware inventory requests may be in flight
/// at the same time; used to size the semaphore that gates the per-node
/// fetch tasks.
const HW_INVENTORY_CONCURRENCY_LIMIT: usize = 15;
23
24// ── Hardware Cluster ──
25
26/// Result of a hardware cluster query.
27pub struct HardwareClusterResult {
28  /// Resolved HSM group name the inventory was collected for (may
29  /// differ from the requested name if the caller's authorization
30  /// only permitted a subset).
31  pub hsm_group_name: String,
32  /// Per-node hardware summaries, one entry per group member.
33  pub node_summaries: Vec<NodeSummary>,
34}
35
36/// Fetch hardware inventory for a slice of xnames concurrently,
37/// rate-limited by a semaphore. Shared by cluster and nodes-list queries.
38async fn fetch_node_summaries(
39  infra: &InfraContext<'_>,
40  token: &str,
41  xnames: &[String],
42) -> Vec<NodeSummary> {
43  let mut tasks = tokio::task::JoinSet::new();
44  let sem = Arc::new(Semaphore::new(HW_INVENTORY_CONCURRENCY_LIMIT));
45
46  let n = xnames.len();
47  let width = n.checked_ilog10().unwrap_or(0) as usize + 1;
48
49  for (i, xname) in xnames.iter().enumerate() {
50    tracing::info!(
51      "\rGetting hw components for node '{xname}' [{:>width$}/{n}]",
52      i + 1
53    );
54
55    let backend_cp = infra.backend.clone();
56    let token_str = token.to_string();
57    let xname_str = xname.to_string();
58    let permit = Arc::clone(&sem).acquire_owned().await;
59
60    tasks.spawn(async move {
61      let _permit = permit;
62      let hw_inventory_value = backend_cp
63        .get_inventory_hardware_query(
64          &token_str, &xname_str, None, None, None, None, None,
65        )
66        .await;
67
68      let node_hw_opt = match hw_inventory_value {
69        Ok(value) => value.pointer("/Nodes/0").cloned(),
70        Err(e) => {
71          tracing::error!(
72            "Failed to get HW inventory for '{}': {}",
73            xname_str,
74            e
75          );
76          None
77        }
78      };
79
80      match node_hw_opt {
81        Some(v) => NodeSummary::from_csm_value(v),
82        None => NodeSummary {
83          xname: xname_str,
84          ..Default::default()
85        },
86      }
87    });
88  }
89
90  let mut summaries = Vec::with_capacity(n);
91  while let Some(res) = tasks.join_next().await {
92    match res {
93      Ok(s) => summaries.push(s),
94      Err(e) => {
95        tracing::error!("Failed fetching node hardware information: {}", e)
96      }
97    }
98  }
99  summaries
100}
101
102/// Fetch hardware inventory for all nodes in a cluster (HSM group).
103///
104/// Concurrently queries hardware inventory for each node, rate-limited
105/// by a semaphore.
106pub async fn get_hardware_cluster(
107  infra: &InfraContext<'_>,
108  token: &str,
109  params: &GetHardwareClusterParams,
110) -> Result<HardwareClusterResult, Error> {
111  let target_hsm_group_vec = get_groups_names_available(
112    infra.backend,
113    token,
114    params.hsm_group_name.as_deref(),
115    params.settings_hsm_group_name.as_deref(),
116  )
117  .await?;
118
119  let hsm_group_name = target_hsm_group_vec
120    .first()
121    .ok_or_else(|| {
122      Error::NotFound("No HSM groups available for this user".to_string())
123    })?
124    .clone();
125
126  let hsm_group = infra.backend.get_group(token, &hsm_group_name).await?;
127
128  let members = hsm_group
129    .members
130    .unwrap_or_default()
131    .ids
132    .unwrap_or_default();
133
134  if members.is_empty() {
135    tracing::warn!("HSM group '{}' has no members", hsm_group.label);
136  }
137
138  tracing::debug!(
139    "Get HW artifacts for nodes in HSM group '{}' and members {:?}",
140    hsm_group.label,
141    members
142  );
143
144  let start_total = Instant::now();
145  let node_summaries = fetch_node_summaries(infra, token, &members).await;
146  tracing::info!(
147    "Time elapsed getting hw inventory for HSM '{}': {:?}",
148    hsm_group_name,
149    start_total.elapsed()
150  );
151
152  Ok(HardwareClusterResult {
153    hsm_group_name,
154    node_summaries,
155  })
156}
157
158// ── Hardware Nodes List ──
159
160/// Result of a hardware nodes-list query.
161pub struct HardwareNodesListResult {
162  /// Per-node hardware summaries, one entry per resolved xname.
163  pub node_summaries: Vec<NodeSummary>,
164}
165
166/// Fetch hardware inventory for an explicit node expression.
167///
168/// The expression is resolved via `resolve_hosts_expression`, which expands
169/// hostlist notation, translates NIDs to xnames, and validates that every
170/// resolved node actually exists. Authorization is then checked with
171/// `validate_target_hsm_members`.
172pub async fn get_hardware_nodes_list(
173  infra: &InfraContext<'_>,
174  token: &str,
175  params: &GetHardwareNodesListParams,
176) -> Result<HardwareNodesListResult, Error> {
177  let xnames = node_ops::resolve_hosts_expression(
178    infra.backend,
179    token,
180    &params.xnames,
181    false,
182  )
183  .await?;
184
185  if xnames.is_empty() {
186    return Err(Error::BadRequest(
187      "The list of nodes is empty. Nothing to do.".to_string(),
188    ));
189  }
190
191  validate_target_hsm_members(infra.backend, token, &xnames).await?;
192
193  let node_summaries = fetch_node_summaries(infra, token, &xnames).await;
194  Ok(HardwareNodesListResult { node_summaries })
195}
196
197// `calculate_hsm_hw_component_summary` and `get_cluster_hw_pattern` moved
198// to `manta_shared::shared::cluster_status`. Only `calculate_hsm_hw_component_summary`
199// is still needed locally — the tests below use it.
200#[cfg(test)]
201use manta_shared::shared::cluster_status::calculate_hsm_hw_component_summary;
202
#[cfg(test)]
mod tests {
  use super::*;
  use manta_backend_dispatcher::types::{
    ArtifactSummary, ArtifactType, NodeSummary,
  };

  /// Build an `ArtifactSummary` of the given type with a fixed
  /// placeholder xname and the supplied (optional) info string.
  fn make_artifact(
    art_type: ArtifactType,
    info: Option<&str>,
  ) -> ArtifactSummary {
    ArtifactSummary {
      xname: String::from("x0"),
      r#type: art_type,
      info: info.map(str::to_owned),
    }
  }

  /// Shorthand for a processor artifact with the given model string.
  fn cpu(model: &str) -> ArtifactSummary {
    make_artifact(ArtifactType::Processor, Some(model))
  }

  /// Shorthand for a memory artifact with the given capacity string.
  fn dimm(capacity: &str) -> ArtifactSummary {
    make_artifact(ArtifactType::Memory, Some(capacity))
  }

  #[test]
  fn summary_counts_processors_and_accels() {
    let node = NodeSummary {
      xname: String::from("x1000c0s0b0n0"),
      processors: vec![cpu("AMD EPYC 7742"), cpu("AMD EPYC 7742")],
      node_accels: vec![make_artifact(
        ArtifactType::NodeAccel,
        Some("NVIDIA A100"),
      )],
      memory: Vec::new(),
      node_hsn_nics: Vec::new(),
      ..Default::default()
    };

    let summary = calculate_hsm_hw_component_summary(&[node]);

    assert_eq!(summary.get("AMD EPYC 7742"), Some(&2));
    assert_eq!(summary.get("NVIDIA A100"), Some(&1));
  }

  #[test]
  fn summary_converts_memory_mib_to_gib() {
    // Two 16384 MiB modules are expected to be reported as 32 GiB.
    let node = NodeSummary {
      xname: String::from("x1000c0s0b0n0"),
      processors: Vec::new(),
      node_accels: Vec::new(),
      memory: vec![dimm("16384 MiB"), dimm("16384 MiB")],
      node_hsn_nics: Vec::new(),
      ..Default::default()
    };

    let summary = calculate_hsm_hw_component_summary(&[node]);

    assert_eq!(summary.get("Memory (GiB)"), Some(&32));
  }

  #[test]
  fn summary_aggregates_across_multiple_nodes() {
    let first = NodeSummary {
      xname: String::from("n1"),
      processors: vec![cpu("AMD EPYC 7742")],
      ..Default::default()
    };
    let second = NodeSummary {
      xname: String::from("n2"),
      processors: vec![cpu("AMD EPYC 7742"), cpu("Intel Xeon Gold")],
      ..Default::default()
    };

    let summary = calculate_hsm_hw_component_summary(&[first, second]);

    assert_eq!(summary.get("AMD EPYC 7742"), Some(&2));
    assert_eq!(summary.get("Intel Xeon Gold"), Some(&1));
  }

  #[test]
  fn summary_empty_nodes() {
    let no_nodes: Vec<NodeSummary> = Vec::new();

    let summary = calculate_hsm_hw_component_summary(&no_nodes);

    assert!(summary.is_empty());
  }

  #[test]
  fn summary_skips_none_info_in_processors() {
    // The artifact without an info string must not produce an entry.
    let node = NodeSummary {
      xname: String::from("n1"),
      processors: vec![
        make_artifact(ArtifactType::Processor, None),
        cpu("AMD EPYC 7742"),
      ],
      ..Default::default()
    };

    let summary = calculate_hsm_hw_component_summary(&[node]);

    assert_eq!(summary.get("AMD EPYC 7742"), Some(&1));
    assert_eq!(summary.len(), 1);
  }
}
314}