From 56667fbf299526d47820e737ee9f431cc9c2105d Mon Sep 17 00:00:00 2001 From: Diwakar Sharma Date: Mon, 14 Oct 2024 09:39:17 +0000 Subject: [PATCH] feat: connect over tcp if initiator isn't rdma capable Signed-off-by: Diwakar Sharma --- .../agents/src/bin/ha/node/server.rs | 12 ++++- .../csi-driver/src/bin/node/dev/nvmf.rs | 28 +++++++++++- .../csi-driver/src/bin/node/main_.rs | 44 ++++++++++++++++++- control-plane/csi-driver/src/bin/node/node.rs | 7 +++ nix/pkgs/control-plane/cargo-project.nix | 3 +- nix/pkgs/images/default.nix | 4 +- shell.nix | 1 + 7 files changed, 93 insertions(+), 6 deletions(-) diff --git a/control-plane/agents/src/bin/ha/node/server.rs b/control-plane/agents/src/bin/ha/node/server.rs index 43eabf6e4..0f851db51 100755 --- a/control-plane/agents/src/bin/ha/node/server.rs +++ b/control-plane/agents/src/bin/ha/node/server.rs @@ -135,7 +135,7 @@ async fn disconnect_controller( // changed, for example due to interface name modified in io-engine args. let same_controller = is_same_controller(&new_path_uri, &subsystem)?; if same_controller { - tracing::info!(path, "Not disconnecting same NVMe controller"); + tracing::info!(path, "Not disconnecting same NVMe controller. old subsys {subsystem:?}, new path {new_path_uri:?}"); Ok(None) } else { tracing::info!(path, "Disconnecting NVMe controller"); @@ -476,6 +476,16 @@ struct ParsedUri { nqn: String, } +impl std::fmt::Debug for ParsedUri { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ParsedUri") + .field("host", &self.host()) + .field("port", &self.port()) + .field("transport", &self.transport()) + .field("nqn", &self.nqn()) + .finish() + } +} impl ParsedUri { fn new(uri: Uri) -> Result { let host = uri.host().ok_or(SvcError::InvalidArguments {})?.to_string(); diff --git a/control-plane/csi-driver/src/bin/node/dev/nvmf.rs b/control-plane/csi-driver/src/bin/node/dev/nvmf.rs index 8d73943af..c9549f980 100644 --- a/control-plane/csi-driver/src/bin/node/dev/nvmf.rs +++ b/control-plane/csi-driver/src/bin/node/dev/nvmf.rs @@ -14,6 +14,7 @@ use csi_driver::PublishParams; use glob::glob; use nvmeadm::nvmf_subsystem::Subsystem; use regex::Regex; +use tracing::warn; use udev::{Device, Enumerator}; use url::Url; use uuid::Uuid; @@ -22,6 +23,7 @@ use crate::{ config::{config, NvmeConfig, NvmeParseParams}, dev::util::extract_uuid, match_dev::match_nvmf_device, + node::RDMA_CONNECT_CHECK, }; use super::{Attach, Detach, DeviceError, DeviceName}; @@ -307,6 +309,16 @@ pub(crate) fn check_nvme_tcp_module() -> Result<(), std::io::Error> { Ok(()) } +/// Check for the presence of nvme tcp kernel module. +/// TODO: Handle the case where this(and for that matter nvme_tcp too) +/// could be builtin module. +#[allow(unused)] +pub(crate) fn check_nvme_rdma_module() -> Result<(), std::io::Error> { + let path = "/sys/module/nvme_rdma"; + std::fs::metadata(path)?; + Ok(()) +} + /// Set the nvme_core module IO timeout /// (note, this is a system-wide parameter) pub(crate) fn set_nvmecore_iotimeout(io_timeout_secs: u32) -> Result<(), std::io::Error> { @@ -359,5 +371,19 @@ pub(crate) fn transport_from_url(url: &Url) -> Result { .split('+') .nth(1) .unwrap_or(default_xprt.as_str()); - TrType::from_str(xprt).map_err(|e| DeviceError::new(format!("{e:?}").as_str())) + + let ret_xprt = TrType::from_str(xprt).map_err(|e| DeviceError::new(format!("{e:?}").as_str())); + let connect_cap_check = RDMA_CONNECT_CHECK.get().unwrap_or(&(false, false)); + + if !connect_cap_check.0 { + ret_xprt + } else { + match ret_xprt { + Ok(t) if t == TrType::rdma && !connect_cap_check.1 => { + warn!("rdma incapable node, connecting over tcp"); + Ok(TrType::tcp) + } + _else => _else, + } + } } diff --git a/control-plane/csi-driver/src/bin/node/main_.rs b/control-plane/csi-driver/src/bin/node/main_.rs index 7898c8d7f..81311c112 100644 --- a/control-plane/csi-driver/src/bin/node/main_.rs +++ b/control-plane/csi-driver/src/bin/node/main_.rs @@ -9,7 +9,7 @@ use crate::{ identity::Identity, k8s::patch_k8s_node, mount::probe_filesystems, - node::Node, + node::{Node, RDMA_CONNECT_CHECK}, nodeplugin_grpc::NodePluginGrpcServer, nodeplugin_nvme::NvmeOperationsSvc, registration::run_registration_loop, @@ -221,6 +221,16 @@ pub(super) async fn main() -> anyhow::Result<()> { .value_parser(clap::value_parser!(bool)) .help("Enable ansi color for logs") ) + .arg( + Arg::new("nvme-connect-fallback") + .long("nvme-connect-fallback") + .default_value("false") + .value_parser(clap::value_parser!(bool)) + .help( + "Enable falling back to nvme connect over tcp if initiator node is not rdma capable, \n\ + even though volume target is rdma capable." + ) + ) .subcommand( clap::Command::new("fs-freeze") .arg( @@ -327,6 +337,38 @@ pub(super) async fn main() -> anyhow::Result<()> { check_ana_and_label_node(&kube_client, node_name, nvme_enabled).await?; } + let conn_fallback = matches.get_flag("nvme-connect-fallback"); + // Check and store if this initiator node has an RDMA device so that nvme connect over RDMA + // can be done. If this node is also an io-engine node with rdma enabled and working, + // then this check will always set capability true. + let ibv_output = tokio::process::Command::new("ibv_devinfo") + .arg("-l") + .output() + .await; + + let cap = match ibv_output { + Ok(okstdout) if okstdout.status.success() => { + // In an unlikely case string from utf8 fails, err towards incapability. + let outstr = String::from_utf8(okstdout.stdout).unwrap_or("0 HCA".to_string()); + if !outstr.starts_with("0 HCA") { + info!("host node rdma capable. conn_fallback({conn_fallback:?}), {outstr:?}"); + // todo: check for the presence of nvme_rdma too. But don't bail out as we do for + // nvme_tcp. + true + } else { + info!( + "host node is not rdma capable. conn_fallback({conn_fallback:?}), {outstr:?}" + ); + false + } + } + Ok(_) | Err(_) => { + error!("Error executing ibv_devinfo, or invalid command output. conn_fallback({conn_fallback:?}), {ibv_output:?}"); + false + } + }; + let _ = RDMA_CONNECT_CHECK.set((conn_fallback, cap)); + // Parse the CSI socket file name from the command line arguments. let csi_socket = matches .get_one::("csi-socket") diff --git a/control-plane/csi-driver/src/bin/node/node.rs b/control-plane/csi-driver/src/bin/node/node.rs index 8a07243a1..8e968b236 100644 --- a/control-plane/csi-driver/src/bin/node/node.rs +++ b/control-plane/csi-driver/src/bin/node/node.rs @@ -10,6 +10,7 @@ use csi_driver::{ filesystem::FileSystem as Fs, limiter::VolumeOpGuard, }; +use once_cell::sync::OnceCell; use rpc::{ csi, csi::{ @@ -76,6 +77,12 @@ impl Node { const ATTACH_TIMEOUT_INTERVAL: Duration = Duration::from_millis(100); const ATTACH_RETRIES: u32 = 100; +// A type and static variable used to check and set node's rdma capability during startup. +// This is used to decide if initiator can indeed connect over rdma. +// First index bool tells if we need to fallback to tcp connect in case initiator node is +// not rdma capable. Second index bool is the actual capability indicator. +type CheckAndFallbackNvmeConnect = (bool, bool); +pub(crate) static RDMA_CONNECT_CHECK: OnceCell = OnceCell::new(); // Determine if given access mode in conjunction with ro mount flag makes // sense or not. If access mode is not supported or the combination does diff --git a/nix/pkgs/control-plane/cargo-project.nix b/nix/pkgs/control-plane/cargo-project.nix index 492b94d4b..04bee7fe2 100644 --- a/nix/pkgs/control-plane/cargo-project.nix +++ b/nix/pkgs/control-plane/cargo-project.nix @@ -12,6 +12,7 @@ , gitVersions , openapi-generator , which +, rdma-core , systemdMinimal , utillinux , sourcer @@ -71,7 +72,7 @@ let inherit LIBCLANG_PATH PROTOC PROTOC_INCLUDE; nativeBuildInputs = [ clang pkg-config openapi-generator which git llvmPackages.bintools ]; - buildInputs = [ llvmPackages.libclang protobuf openssl.dev systemdMinimal.dev utillinux.dev ]; + buildInputs = [ llvmPackages.libclang protobuf openssl.dev systemdMinimal.dev utillinux.dev rdma-core ]; doCheck = false; }; release_build = { "release" = true; "debug" = false; }; diff --git a/nix/pkgs/images/default.nix b/nix/pkgs/images/default.nix index 6981612c8..74d2edfec 100644 --- a/nix/pkgs/images/default.nix +++ b/nix/pkgs/images/default.nix @@ -2,7 +2,7 @@ # avoid dependency on docker tool chain. Though the maturity of OCI # builder in nixpkgs is questionable which is why we postpone this step. -{ pkgs, xfsprogs_5_16, busybox, dockerTools, lib, e2fsprogs, btrfs-progs, utillinux, fetchurl, control-plane, tini, sourcer, img_tag ? "", img_org ? "", img_prefix }: +{ pkgs, xfsprogs_5_16, rdma-core, busybox, dockerTools, lib, e2fsprogs, btrfs-progs, utillinux, fetchurl, control-plane, tini, sourcer, img_tag ? "", img_org ? "", img_prefix }: let repo-org = if img_org != "" then img_org else "${builtins.readFile (pkgs.runCommand "repo_org" { buildInputs = with pkgs; [ git ]; @@ -104,7 +104,7 @@ let inherit buildType; name = "node"; config = { - Env = [ "PATH=${lib.makeBinPath [ "/" xfsprogs e2fsprogs_1_46_2 btrfs-progs utillinux ]}" ]; + Env = [ "PATH=${lib.makeBinPath [ "/" xfsprogs rdma-core e2fsprogs_1_46_2 btrfs-progs utillinux ]}" ]; }; }; }; diff --git a/shell.nix b/shell.nix index b1dc71953..470891733 100644 --- a/shell.nix +++ b/shell.nix @@ -39,6 +39,7 @@ mkShell { pre-commit python3 utillinux + rdma-core which ] ++ pkgs.lib.optional (!norust) rust ++ pkgs.lib.optionals (system != "aarch64-darwin") [