1use std::sync::Arc;
4use std::time::Instant;
5
6use manta_backend_dispatcher::error::Error;
7use manta_backend_dispatcher::interfaces::hsm::group::GroupTrait;
8use manta_backend_dispatcher::interfaces::hsm::{
9 component::ComponentTrait, hardware_inventory::HardwareInventory,
10};
11use manta_backend_dispatcher::types::NodeSummary;
12use tokio::sync::Semaphore;
13
14use crate::server::common::app_context::InfraContext;
15use crate::service::authorization::{
16 validate_user_group_members_access, validate_user_group_vec_access,
17};
18use crate::service::node_ops::from_hosts_expression_to_xname_vec;
19pub use manta_shared::types::api::hardware::{
20 GetHardwareClusterParams, GetHardwareNodesListParams,
21};
22
/// Upper bound on the number of hardware-inventory queries that may be in
/// flight at once; it is the permit count of the semaphore created in
/// `fetch_node_summaries`.
const HW_INVENTORY_CONCURRENCY_LIMIT: usize = 15;
25
/// Value returned by [`get_hardware_cluster`]: the HSM group that was
/// inspected together with the hardware summaries gathered for its members.
pub struct HardwareClusterResult {
  /// Name of the HSM group the summaries were collected for.
  pub hsm_group_name: String,
  /// Hardware summaries for the member nodes of that group.
  pub node_summaries: Vec<NodeSummary>,
}
37
38async fn fetch_node_summaries(
41 infra: &InfraContext<'_>,
42 token: &str,
43 xnames: &[String],
44) -> Vec<NodeSummary> {
45 let mut tasks = tokio::task::JoinSet::new();
46 let sem = Arc::new(Semaphore::new(HW_INVENTORY_CONCURRENCY_LIMIT));
47
48 let n = xnames.len();
49 let width = n.checked_ilog10().unwrap_or(0) as usize + 1;
50
51 for (i, xname) in xnames.iter().enumerate() {
52 tracing::info!(
53 "\rGetting hw components for node '{xname}' [{:>width$}/{n}]",
54 i + 1
55 );
56
57 let backend_cp = infra.backend_clone();
58 let token_str = token.to_string();
59 let xname_str = xname.clone();
60 let permit = Arc::clone(&sem).acquire_owned().await;
61
62 tasks.spawn(async move {
63 let _permit = permit;
64 let hw_inventory_typed = backend_cp
65 .get_inventory_hardware_query(
66 &token_str, &xname_str, None, None, None, None, None,
67 )
68 .await;
69
70 let node_hw_opt = match hw_inventory_typed {
75 Ok(hw_inv) => serde_json::to_value(&hw_inv)
76 .ok()
77 .and_then(|v| v.pointer("/Nodes/0").cloned()),
78 Err(e) => {
79 tracing::error!(
80 "Failed to get HW inventory for '{}': {}",
81 xname_str,
82 e
83 );
84 None
85 }
86 };
87
88 match node_hw_opt {
89 Some(v) => NodeSummary::from_csm_value(v),
90 None => NodeSummary {
91 xname: xname_str,
92 ..Default::default()
93 },
94 }
95 });
96 }
97
98 let mut summaries = Vec::with_capacity(n);
99 while let Some(res) = tasks.join_next().await {
100 match res {
101 Ok(s) => summaries.push(s),
102 Err(e) => {
103 tracing::error!("Failed fetching node hardware information: {}", e);
104 }
105 }
106 }
107 summaries
108}
109
110pub async fn get_hardware_cluster(
118 infra: &InfraContext<'_>,
119 token: &str,
120 params: &GetHardwareClusterParams,
121) -> Result<HardwareClusterResult, Error> {
122 let target_group_vec: Vec<String> = if let Some(group) = ¶ms.group_name {
124 vec![group.clone()]
125 } else {
126 infra
127 .backend
128 .get_group_available(token)
129 .await?
130 .iter()
131 .map(|group| group.label.clone())
132 .collect()
133 };
134
135 validate_user_group_vec_access(infra, token, &target_group_vec).await?;
137
138 let hsm_group_name = target_group_vec
139 .first()
140 .ok_or_else(|| {
141 Error::NotFound("No HSM groups available for this user".to_string())
142 })?
143 .clone();
144
145 let hsm_group = infra.backend.get_group(token, &hsm_group_name).await?;
146
147 let members = hsm_group
148 .members
149 .unwrap_or_default()
150 .ids
151 .unwrap_or_default();
152
153 if members.is_empty() {
154 tracing::warn!("HSM group '{}' has no members", hsm_group.label);
155 }
156
157 tracing::debug!(
158 "Get HW artifacts for nodes in HSM group '{}' and members {:?}",
159 hsm_group.label,
160 members
161 );
162
163 let start_total = Instant::now();
164 let node_summaries = fetch_node_summaries(infra, token, &members).await;
165 tracing::info!(
166 "Time elapsed getting hw inventory for HSM '{}': {:?}",
167 hsm_group_name,
168 start_total.elapsed()
169 );
170
171 Ok(HardwareClusterResult {
172 hsm_group_name,
173 node_summaries,
174 })
175}
176
/// Value returned by [`get_hardware_nodes_list`].
pub struct HardwareNodesListResult {
  /// Hardware summaries for the nodes matched by the request.
  pub node_summaries: Vec<NodeSummary>,
}
184
185pub async fn get_hardware_nodes_list(
195 infra: &InfraContext<'_>,
196 token: &str,
197 params: &GetHardwareNodesListParams,
198) -> Result<HardwareNodesListResult, Error> {
199 let node_metadata_available_vec =
200 infra.backend.get_node_metadata_available(token).await?;
201
202 let node_list = from_hosts_expression_to_xname_vec(
203 ¶ms.host_expression,
204 false,
205 &node_metadata_available_vec,
206 )?;
207
208 if node_list.is_empty() {
209 return Err(Error::BadRequest(
210 "The list of nodes to operate is empty. Nothing to do".to_string(),
211 ));
212 }
213
214 validate_user_group_members_access(infra, token, &node_list).await?;
216
217 let node_summaries = fetch_node_summaries(infra, token, &node_list).await;
218 Ok(HardwareNodesListResult { node_summaries })
219}
220
221#[cfg(test)]
226use manta_shared::types::cluster_status::calculate_group_hw_component_summary;
227
#[cfg(test)]
mod tests {
  use super::*;
  use manta_backend_dispatcher::types::{
    ArtifactSummary, ArtifactType, NodeSummary,
  };

  /// Builds an artifact of kind `art_type` on the placeholder xname `x0`,
  /// with `info` copied into an owned string when present.
  fn make_artifact(
    art_type: ArtifactType,
    info: Option<&str>,
  ) -> ArtifactSummary {
    let info = info.map(str::to_owned);
    ArtifactSummary {
      xname: String::from("x0"),
      r#type: art_type,
      info,
    }
  }

  /// Shorthand for a processor artifact.
  fn processor(info: Option<&str>) -> ArtifactSummary {
    make_artifact(ArtifactType::Processor, info)
  }

  /// Two identical processors and one accelerator on one node are counted
  /// under their respective `info` strings.
  #[test]
  fn summary_counts_processors_and_accels() {
    let epyc = Some("AMD EPYC 7742");
    let node = NodeSummary {
      xname: String::from("x1000c0s0b0n0"),
      processors: vec![processor(epyc), processor(epyc)],
      node_accels: vec![make_artifact(
        ArtifactType::NodeAccel,
        Some("NVIDIA A100"),
      )],
      memory: Vec::new(),
      node_hsn_nics: Vec::new(),
      ..Default::default()
    };
    let nodes = vec![node];

    let summary = calculate_group_hw_component_summary(&nodes);

    assert_eq!(summary.get("AMD EPYC 7742").copied(), Some(2));
    assert_eq!(summary.get("NVIDIA A100").copied(), Some(1));
  }

  /// Two 16384 MiB memory artifacts are reported as 32 under "Memory (GiB)".
  #[test]
  fn summary_converts_memory_mib_to_gib() {
    let dimms: Vec<ArtifactSummary> = (0..2)
      .map(|_| make_artifact(ArtifactType::Memory, Some("16384 MiB")))
      .collect();
    let node = NodeSummary {
      xname: String::from("x1000c0s0b0n0"),
      processors: Vec::new(),
      node_accels: Vec::new(),
      memory: dimms,
      node_hsn_nics: Vec::new(),
      ..Default::default()
    };
    let nodes = vec![node];

    let summary = calculate_group_hw_component_summary(&nodes);

    assert_eq!(summary.get("Memory (GiB)").copied(), Some(32));
  }

  /// Counts for the same processor model are summed across nodes.
  #[test]
  fn summary_aggregates_across_multiple_nodes() {
    let first = NodeSummary {
      xname: String::from("n1"),
      processors: vec![processor(Some("AMD EPYC 7742"))],
      ..Default::default()
    };
    let second = NodeSummary {
      xname: String::from("n2"),
      processors: vec![
        processor(Some("AMD EPYC 7742")),
        processor(Some("Intel Xeon Gold")),
      ],
      ..Default::default()
    };
    let nodes = vec![first, second];

    let summary = calculate_group_hw_component_summary(&nodes);

    assert_eq!(summary.get("AMD EPYC 7742").copied(), Some(2));
    assert_eq!(summary.get("Intel Xeon Gold").copied(), Some(1));
  }

  /// An empty node list produces an empty summary.
  #[test]
  fn summary_empty_nodes() {
    let nodes: Vec<NodeSummary> = Vec::new();

    let summary = calculate_group_hw_component_summary(&nodes);

    assert!(summary.is_empty());
  }

  /// A processor without `info` does not contribute an entry.
  #[test]
  fn summary_skips_none_info_in_processors() {
    let node = NodeSummary {
      xname: String::from("n1"),
      processors: vec![processor(None), processor(Some("AMD EPYC 7742"))],
      ..Default::default()
    };
    let nodes = vec![node];

    let summary = calculate_group_hw_component_summary(&nodes);

    assert_eq!(summary.get("AMD EPYC 7742").copied(), Some(1));
    assert_eq!(summary.len(), 1);
  }
}