manta_server/service/
node.rs

1//! HSM node queries, registration, and deletion, with rollback on partial failure.
2
3use manta_backend_dispatcher::error::Error;
4use manta_backend_dispatcher::interfaces::hsm::{
5  component::ComponentTrait, group::GroupTrait,
6  hardware_inventory::HardwareInventory,
7};
8use manta_backend_dispatcher::types::{
9  ComponentArrayPostArray, ComponentCreate, HWInventoryByLocationList,
10};
11use manta_shared::types::dto::NodeDetails;
12use std::path::PathBuf;
13
14use crate::server::common::app_context::InfraContext;
15use crate::service::authorization::validate_user_group_members_access;
16use crate::service::node_details;
17use crate::service::node_ops::from_user_hosts_expression_to_xname_vec;
18pub use manta_shared::types::api::node::GetNodesParams;
19
20/// Fetch HSM node details for the targets named by
21/// `params.host_expression`.
22///
23/// The expression is parsed by
24/// [`crate::service::node_ops::from_user_hosts_expression_to_xname_vec`];
25/// when `params.include_siblings` is set, the resulting xnames are
26/// expanded to cover every node on the same BMC. Access to the
27/// resolved set is validated before the (relatively slow) per-node
28/// detail fetch. The optional `status_filter` matches case-insensitively
29/// against either the power or configuration status. Results are
30/// sorted by xname for stable output.
31pub async fn get_nodes(
32  infra: &InfraContext<'_>,
33  token: &str,
34  params: &GetNodesParams,
35) -> Result<Vec<NodeDetails>, Error> {
36  let node_list = from_user_hosts_expression_to_xname_vec(
37    infra,
38    token,
39    &params.host_expression,
40    params.include_siblings,
41  )
42  .await?;
43
44  if node_list.is_empty() {
45    return Err(Error::BadRequest(
46      "The list of nodes to operate is empty. Nothing to do".to_string(),
47    ));
48  }
49
50  // Validate xnames
51  validate_user_group_members_access(infra, token, &node_list).await?;
52
53  let mut node_details_list =
54    node_details::get_node_details(infra, token, &node_list).await?;
55
56  // Apply status filter
57  if let Some(ref status) = params.status_filter {
58    node_details_list.retain(|nd| {
59      nd.power_status.eq_ignore_ascii_case(status)
60        || nd.configuration_status.eq_ignore_ascii_case(status)
61    });
62  }
63
64  node_details_list.sort_by(|a, b| a.xname.cmp(&b.xname));
65
66  Ok(node_details_list)
67}
68
69// `compute_summary_status` moved to `manta_shared::types::cluster_status` —
70// only CLI display code calls it.
71
72/// Remove the HSM component with id `id` (typically an xname).
73///
74/// The caller's group access to the node is validated before the
75/// delete is dispatched.
76pub async fn delete_node(
77  infra: &InfraContext<'_>,
78  token: &str,
79  id: &str,
80) -> Result<(), Error> {
81  validate_user_group_members_access(infra, token, &[id.to_string()]).await?;
82
83  infra.backend.delete_node(token, id).await.map(|_| ())
84}
85
86/// Register a new HSM component, attach an optional hardware
87/// inventory file, and add it to the named group.
88///
89/// The flow is three writes: `post_nodes`, then (if
90/// `hardware_file_path` is supplied) `post_inventory_hardware` after
91/// parsing the JSON file, then `post_member`. Each failure after the
92/// initial create rolls back by deleting the node, so a partial
93/// failure does not leave a stub component behind. Hardware-file
94/// parse errors are reported as the original IO / serde error, with
95/// the same rollback applied.
96pub async fn add_node(
97  infra: &InfraContext<'_>,
98  token: &str,
99  id: &str,
100  group: &str,
101  enabled: bool,
102  arch_opt: Option<String>,
103  hardware_file_path: Option<&PathBuf>,
104) -> Result<(), Error> {
105  validate_user_group_members_access(infra, token, &[id.to_string()]).await?;
106
107  // Create node
108  let component = ComponentCreate {
109    id: id.to_string(),
110    state: "Unknown".to_string(),
111    flag: None,
112    enabled: Some(enabled),
113    software_status: None,
114    role: None,
115    sub_role: None,
116    nid: None,
117    subtype: None,
118    net_type: None,
119    arch: arch_opt,
120    class: None,
121  };
122
123  let components = ComponentArrayPostArray {
124    components: vec![component],
125    force: Some(true),
126  };
127
128  infra.backend.post_nodes(token, components).await?;
129
130  tracing::info!("Node saved '{}'", id);
131
132  // Parse and add hardware inventory if provided.
133  //
134  // HW inventory files are operator-supplied JSON that can run to
135  // several MB. Reading them with the sync `std::fs::File` +
136  // `serde_json::from_reader` chain parked the Tokio worker for the
137  // duration of the read and parse, stalling unrelated requests
138  // queued behind it on the same worker. `tokio::fs::read` does the
139  // I/O on a blocking pool; the in-memory `from_slice` parse stays
140  // on the worker but is bounded by file size.
141  let hw_inventory_opt: Option<HWInventoryByLocationList> =
142    if let Some(hardware_file) = hardware_file_path {
143      match read_hw_inventory(hardware_file).await {
144        Ok(inv) => Some(inv),
145        Err(e) => {
146          rollback_node(infra, token, id).await;
147          return Err(e);
148        }
149      }
150    } else {
151      None
152    };
153
154  if let Some(hw_inventory) = hw_inventory_opt {
155    tracing::info!("Adding hardware inventory for '{}'", id);
156    if let Err(error) = infra
157      .backend
158      .post_inventory_hardware(token, hw_inventory)
159      .await
160      .map(|_| ())
161    {
162      rollback_node(infra, token, id).await;
163      return Err(error);
164    }
165  }
166
167  // Add node to group
168  if let Err(error) = infra
169    .backend
170    .post_member(token, group, id)
171    .await
172    .map(|_| ())
173  {
174    rollback_node(infra, token, id).await;
175    return Err(error);
176  }
177
178  Ok(())
179}
180
181/// Read and parse a hardware-inventory JSON file off the Tokio
182/// reactor. The two-step `Value` → `from_value` round-trip is kept
183/// so the surfaced parse error still names the bad field (csm-rs
184/// uses `#[serde(rename = "ID")]` etc. and the direct `from_slice`
185/// path produces less helpful errors when a key is mistyped).
186async fn read_hw_inventory(
187  path: &PathBuf,
188) -> Result<HWInventoryByLocationList, Error> {
189  let bytes = tokio::fs::read(path).await?;
190  let value: serde_json::Value = serde_json::from_slice(&bytes)?;
191  let inv = serde_json::from_value::<HWInventoryByLocationList>(value)?;
192  Ok(inv)
193}
194
195/// Rollback helper: attempt to delete a node that was partially created.
196async fn rollback_node(infra: &InfraContext<'_>, token: &str, id: &str) {
197  tracing::warn!("Rolling back: attempting to delete node '{}'", id);
198  let delete_node_rslt = infra.backend.delete_node(token, id).await;
199  if delete_node_rslt.is_ok() {
200    tracing::info!("Rollback: node '{}' deleted", id);
201  }
202}