1use std::sync::Arc;
4use std::time::Instant;
5
6use manta_backend_dispatcher::error::Error;
7use manta_backend_dispatcher::interfaces::hsm::group::GroupTrait;
8use manta_backend_dispatcher::interfaces::hsm::hardware_inventory::HardwareInventory;
9use manta_backend_dispatcher::types::NodeSummary;
10use tokio::sync::Semaphore;
11
12use crate::server::common::app_context::InfraContext;
13use crate::server::common::authorization::{
14 get_groups_names_available, validate_target_hsm_members,
15};
16use crate::server::common::node_ops;
17pub use manta_shared::shared::params::hardware::{
18 GetHardwareClusterParams, GetHardwareNodesListParams,
19};
20
/// Upper bound on the number of hardware inventory queries that may be in
/// flight at the same time; used to size the semaphore that gates them.
const HW_INVENTORY_CONCURRENCY_LIMIT: usize = 15;
23
24pub struct HardwareClusterResult {
28 pub hsm_group_name: String,
32 pub node_summaries: Vec<NodeSummary>,
34}
35
36async fn fetch_node_summaries(
39 infra: &InfraContext<'_>,
40 token: &str,
41 xnames: &[String],
42) -> Vec<NodeSummary> {
43 let mut tasks = tokio::task::JoinSet::new();
44 let sem = Arc::new(Semaphore::new(HW_INVENTORY_CONCURRENCY_LIMIT));
45
46 let n = xnames.len();
47 let width = n.checked_ilog10().unwrap_or(0) as usize + 1;
48
49 for (i, xname) in xnames.iter().enumerate() {
50 tracing::info!(
51 "\rGetting hw components for node '{xname}' [{:>width$}/{n}]",
52 i + 1
53 );
54
55 let backend_cp = infra.backend.clone();
56 let token_str = token.to_string();
57 let xname_str = xname.to_string();
58 let permit = Arc::clone(&sem).acquire_owned().await;
59
60 tasks.spawn(async move {
61 let _permit = permit;
62 let hw_inventory_value = backend_cp
63 .get_inventory_hardware_query(
64 &token_str, &xname_str, None, None, None, None, None,
65 )
66 .await;
67
68 let node_hw_opt = match hw_inventory_value {
69 Ok(value) => value.pointer("/Nodes/0").cloned(),
70 Err(e) => {
71 tracing::error!(
72 "Failed to get HW inventory for '{}': {}",
73 xname_str,
74 e
75 );
76 None
77 }
78 };
79
80 match node_hw_opt {
81 Some(v) => NodeSummary::from_csm_value(v),
82 None => NodeSummary {
83 xname: xname_str,
84 ..Default::default()
85 },
86 }
87 });
88 }
89
90 let mut summaries = Vec::with_capacity(n);
91 while let Some(res) = tasks.join_next().await {
92 match res {
93 Ok(s) => summaries.push(s),
94 Err(e) => {
95 tracing::error!("Failed fetching node hardware information: {}", e)
96 }
97 }
98 }
99 summaries
100}
101
102pub async fn get_hardware_cluster(
107 infra: &InfraContext<'_>,
108 token: &str,
109 params: &GetHardwareClusterParams,
110) -> Result<HardwareClusterResult, Error> {
111 let target_hsm_group_vec = get_groups_names_available(
112 infra.backend,
113 token,
114 params.hsm_group_name.as_deref(),
115 params.settings_hsm_group_name.as_deref(),
116 )
117 .await?;
118
119 let hsm_group_name = target_hsm_group_vec
120 .first()
121 .ok_or_else(|| {
122 Error::NotFound("No HSM groups available for this user".to_string())
123 })?
124 .clone();
125
126 let hsm_group = infra.backend.get_group(token, &hsm_group_name).await?;
127
128 let members = hsm_group
129 .members
130 .unwrap_or_default()
131 .ids
132 .unwrap_or_default();
133
134 if members.is_empty() {
135 tracing::warn!("HSM group '{}' has no members", hsm_group.label);
136 }
137
138 tracing::debug!(
139 "Get HW artifacts for nodes in HSM group '{}' and members {:?}",
140 hsm_group.label,
141 members
142 );
143
144 let start_total = Instant::now();
145 let node_summaries = fetch_node_summaries(infra, token, &members).await;
146 tracing::info!(
147 "Time elapsed getting hw inventory for HSM '{}': {:?}",
148 hsm_group_name,
149 start_total.elapsed()
150 );
151
152 Ok(HardwareClusterResult {
153 hsm_group_name,
154 node_summaries,
155 })
156}
157
158pub struct HardwareNodesListResult {
162 pub node_summaries: Vec<NodeSummary>,
164}
165
166pub async fn get_hardware_nodes_list(
173 infra: &InfraContext<'_>,
174 token: &str,
175 params: &GetHardwareNodesListParams,
176) -> Result<HardwareNodesListResult, Error> {
177 let xnames = node_ops::resolve_hosts_expression(
178 infra.backend,
179 token,
180 ¶ms.xnames,
181 false,
182 )
183 .await?;
184
185 if xnames.is_empty() {
186 return Err(Error::BadRequest(
187 "The list of nodes is empty. Nothing to do.".to_string(),
188 ));
189 }
190
191 validate_target_hsm_members(infra.backend, token, &xnames).await?;
192
193 let node_summaries = fetch_node_summaries(infra, token, &xnames).await;
194 Ok(HardwareNodesListResult { node_summaries })
195}
196
197#[cfg(test)]
201use manta_shared::shared::cluster_status::calculate_hsm_hw_component_summary;
202
#[cfg(test)]
mod tests {
  use super::*;
  use manta_backend_dispatcher::types::{
    ArtifactSummary, ArtifactType, NodeSummary,
  };

  /// CPU model string shared by several fixtures.
  const EPYC: &str = "AMD EPYC 7742";

  /// Builds an artifact of the given type on the placeholder xname `x0`.
  fn make_artifact(
    art_type: ArtifactType,
    info: Option<&str>,
  ) -> ArtifactSummary {
    ArtifactSummary {
      xname: String::from("x0"),
      r#type: art_type,
      info: info.map(str::to_owned),
    }
  }

  /// Shorthand for a processor artifact described by `model`.
  fn processor(model: &str) -> ArtifactSummary {
    make_artifact(ArtifactType::Processor, Some(model))
  }

  /// Shorthand for a memory artifact described by `size`.
  fn memory_module(size: &str) -> ArtifactSummary {
    make_artifact(ArtifactType::Memory, Some(size))
  }

  /// Processors and accelerators are counted per description string.
  #[test]
  fn summary_counts_processors_and_accels() {
    let accel = make_artifact(ArtifactType::NodeAccel, Some("NVIDIA A100"));
    let nodes = vec![NodeSummary {
      xname: String::from("x1000c0s0b0n0"),
      processors: vec![processor(EPYC), processor(EPYC)],
      node_accels: vec![accel],
      memory: Vec::new(),
      node_hsn_nics: Vec::new(),
      ..Default::default()
    }];

    let summary = calculate_hsm_hw_component_summary(&nodes);

    assert_eq!(summary.get("AMD EPYC 7742"), Some(&2));
    assert_eq!(summary.get("NVIDIA A100"), Some(&1));
  }

  /// Two 16384 MiB modules are reported as 32 under `Memory (GiB)`.
  #[test]
  fn summary_converts_memory_mib_to_gib() {
    let nodes = vec![NodeSummary {
      xname: String::from("x1000c0s0b0n0"),
      processors: Vec::new(),
      node_accels: Vec::new(),
      memory: vec![memory_module("16384 MiB"), memory_module("16384 MiB")],
      node_hsn_nics: Vec::new(),
      ..Default::default()
    }];

    let summary = calculate_hsm_hw_component_summary(&nodes);

    assert_eq!(summary.get("Memory (GiB)"), Some(&32));
  }

  /// Counts for the same description add up across nodes.
  #[test]
  fn summary_aggregates_across_multiple_nodes() {
    let first = NodeSummary {
      xname: String::from("n1"),
      processors: vec![processor(EPYC)],
      ..Default::default()
    };
    let second = NodeSummary {
      xname: String::from("n2"),
      processors: vec![processor(EPYC), processor("Intel Xeon Gold")],
      ..Default::default()
    };
    let nodes = vec![first, second];

    let summary = calculate_hsm_hw_component_summary(&nodes);

    assert_eq!(summary.get("AMD EPYC 7742"), Some(&2));
    assert_eq!(summary.get("Intel Xeon Gold"), Some(&1));
  }

  /// No nodes means an empty summary.
  #[test]
  fn summary_empty_nodes() {
    let nodes: Vec<NodeSummary> = Vec::new();

    let summary = calculate_hsm_hw_component_summary(&nodes);

    assert!(summary.is_empty());
  }

  /// Artifacts without an `info` string do not produce a summary entry.
  #[test]
  fn summary_skips_none_info_in_processors() {
    let nodes = vec![NodeSummary {
      xname: String::from("n1"),
      processors: vec![
        make_artifact(ArtifactType::Processor, None),
        processor(EPYC),
      ],
      ..Default::default()
    }];

    let summary = calculate_hsm_hw_component_summary(&nodes);

    assert_eq!(summary.get("AMD EPYC 7742"), Some(&1));
    assert_eq!(summary.len(), 1);
  }
}