Skip to content

Commit

Permalink
BinaryCIF Import Implementation (#353)
Browse files Browse the repository at this point in the history
  • Loading branch information
JarrettSJohnson authored Jun 5, 2024
1 parent fd4e3a8 commit 6731589
Show file tree
Hide file tree
Showing 11 changed files with 683 additions and 96 deletions.
430 changes: 397 additions & 33 deletions layer2/CifFile.cpp

Large diffs are not rendered by default.

206 changes: 163 additions & 43 deletions layer2/CifFile.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,17 @@
#include <map>
#include <memory>
#include <vector>
#include <string>
#include <variant>

// for pymol::default_free
#include "MemoryDebug.h"

/**
 * Helper for building an ad-hoc visitor out of several callables (typically
 * lambdas). The resulting type inherits from every callable and pulls all of
 * their call operators into one overload set, so it can be handed to
 * `std::visit`.
 */
template<class... Ts>
struct overloaded : Ts... { using Ts::operator()...; };
// Deduction guide: lets `overloaded{f, g, ...}` deduce its template arguments
// from the callables passed to it.
template<class... Ts>
overloaded(Ts...) -> overloaded<Ts...>;

namespace pymol {
namespace _cif_detail {

Expand Down Expand Up @@ -44,6 +51,11 @@ template <typename T> T raw_to_typed(const char*);
class cif_data;
class cif_loop;
class cif_array;
// Forward declarations of the two data block storage types, so that the
// CIFData alias below can be named before the structs are defined.
namespace cif_detail {
struct cif_str_data;
struct bcif_data;
};
// Storage of one data block: holds exactly one of the two representations
// declared above.
using CIFData = std::variant<cif_detail::cif_str_data, cif_detail::bcif_data>;

/**
* Class for reading CIF files.
Expand All @@ -57,7 +69,7 @@ class cif_array;
*
* Iterate over data blocks:
* @verbatim
for (auto& block : cf.datablocks()) {
for (auto& [code, block] : cf.datablocks()) {
// data_<code>
const char* block_code = block.code();
Expand All @@ -81,7 +93,7 @@ class cif_array;
*/
class cif_file {
std::vector<char*> m_tokens;
std::vector<cif_data> m_datablocks;
std::map<std::string, cif_data> m_datablocks;
std::unique_ptr<char, pymol::default_free> m_contents;

/**
Expand All @@ -98,6 +110,14 @@ class cif_file {
/// Parse CIF string
bool parse_string(const char*);

/**
* Parse BinaryCIF blob
* @param bytes BinaryCIF blob
* @param size Blob size
* @post datablocks() is valid
*/
bool parse_bcif(const char* bytes, std::size_t size);

protected:
/// Report a parsing error
virtual void error(const char*);
Expand All @@ -114,54 +134,112 @@ class cif_file {
cif_file(const char* filename, const char* contents = nullptr);

/// Data blocks
const std::vector<cif_data>& datablocks() const { return m_datablocks; }
const std::map<std::string, cif_data>& datablocks() const { return m_datablocks; }
};

/**
* View on a CIF data array. The viewed data is owned by the cif_file
*/
class cif_array {
friend class cif_file;

private:
enum { NOT_IN_LOOP = -1 };
// A single element of a BinaryCIF-backed array (see bcif_array::m_arr):
// one of several fixed-width integer types, float, double, or a string.
// NOTE: var_to_typed() treats an element holding an empty string as
// missing/inapplicable and returns the caller's default for it.
using CifArrayElement = std::variant<std::int8_t, std::int16_t, std::int32_t,
std::uint8_t, std::uint16_t, std::uint32_t, float, double, std::string>;

// column index, -1 if not in loop
short col;
namespace cif_detail {
struct cif_str_array {
enum { NOT_IN_LOOP = -1 };

// pointer to either loop or single value
union {
const cif_loop * loop;
const char * value;
} pointer;
// column index, -1 if not in loop
short col;

// Raw data value or nullptr for unknown/inapplicable and `pos >= size()`
const char* get_value_raw(unsigned pos = 0) const;
// pointer to either loop or single value
union {
const cif_loop * loop;
const char * value;
} pointer;

// point this array to a loop (only for parsing)
void set_loop(const cif_loop * loop, short col_) {
col = col_;
pointer.loop = loop;
};
// Raw data value or NULL for unknown/inapplicable and `pos >= size()`
const char* get_value_raw(unsigned pos = 0) const;

// point this array to a loop (only for parsing)
void set_loop(const cif_loop * loop, short col_) {
col = col_;
pointer.loop = loop;
};

// point this array to a single value (only for parsing)
void set_value(const char * value) {
col = NOT_IN_LOOP;
pointer.value = value;
// point this array to a single value (only for parsing)
void set_value(const char * value) {
col = NOT_IN_LOOP;
pointer.value = value;
};
};
/**
 * Array storage that owns its elements as a vector of CifArrayElement
 * variants (in contrast to cif_str_array, which only holds a pointer).
 */
struct bcif_array {
// Owned elements; empty by default
std::vector<CifArrayElement> m_arr{};
};

/**
* Returns a typed value from a CIF data element.
* If the element is missing or inapplicable, return `d`.
* @param var CIF data element
* @param d default value
* @return typed value
*/
template <typename T> T var_to_typed(const CifArrayElement& var, const T& d)
{
if constexpr (std::is_same_v<T, const char*>) {
auto& str = std::get<std::string>(var);
return !str.empty() ? str.c_str() : d;
} else {
if (auto ptr = std::get_if<std::string>(&var); ptr && ptr->empty()) {
return d;
}
if constexpr (!std::is_same_v<T, std::string>) {
return std::visit(overloaded{[](const std::string& s) -> T {
return _cif_detail::raw_to_typed<T>(
s.c_str());
},
[](const auto& v) -> T { return v; }},
var);
}
}
return d;
}
}

/**
* View on a CIF data array. The viewed data is owned by the cif_file
*/
class cif_array {
friend class cif_file;

private:
mutable std::string m_internal_str_cache;
std::variant<cif_detail::cif_str_array, cif_detail::bcif_array> m_array;

public:
// constructor
cif_array() = default;

// constructor (only needed for EMPTY_ARRAY)
cif_array(std::nullptr_t) { set_value(nullptr); }
cif_array(std::nullptr_t) {
  // A default-constructed variant always holds its first alternative
  // (cif_str_array), so there is no BinaryCIF case to handle here. Point
  // the array at a null single value, i.e. "not in a loop, no data".
  std::get<cif_detail::cif_str_array>(m_array).set_value(nullptr);
}

cif_array(std::vector<CifArrayElement>&& arr) {
m_array = cif_detail::bcif_array{std::move(arr)};
}

/// Number of elements in this array (= number of rows in loop)
unsigned size() const;

/// True if value in ['.', '?']
bool is_missing(unsigned pos = 0) const { return !get_value_raw(pos); }
bool is_missing(unsigned pos = 0) const {
  // String-backed array: the raw value is null for unknown/inapplicable
  // values and for out-of-range positions.
  if (auto arr = std::get_if<cif_detail::cif_str_array>(&m_array)) {
    return !arr->get_value_raw(pos);
  }

  // BinaryCIF-backed array: mirror what as() / var_to_typed() consider
  // "no value", i.e. an out-of-range position or an empty string element.
  if (auto arr = std::get_if<cif_detail::bcif_array>(&m_array)) {
    if (pos >= arr->m_arr.size()) {
      return true;
    }
    auto str = std::get_if<std::string>(&arr->m_arr[pos]);
    return str && str->empty();
  }

  return false;
}

/// True if all values in ['.', '?']
bool is_missing_all() const;
Expand All @@ -172,8 +250,16 @@ class cif_array {
* @param d default value for unknown/inapplicable elements
*/
template <typename T> T as(unsigned pos = 0, T d = T()) const {
const char* s = get_value_raw(pos);
return s ? _cif_detail::raw_to_typed<T>(s) : d;
if (auto arr = std::get_if<cif_detail::cif_str_array>(&m_array)) {
const char* s = arr->get_value_raw(pos);
return s ? _cif_detail::raw_to_typed<T>(s) : d;
} else if (auto arr = std::get_if<cif_detail::bcif_array>(&m_array)) {
if (pos >= arr->m_arr.size())
return d;
auto& var = arr->m_arr[pos];
return cif_detail::var_to_typed<T>(var, d);
}
return d;
}

/**
Expand All @@ -184,7 +270,25 @@ class cif_array {
* @param d default value for unknown/inapplicable elements
*/
const char* as_s(unsigned pos = 0, const char* d = "") const {
return as(pos, d);
if (std::get_if<cif_detail::cif_str_array>(&m_array)) {
return as(pos, d);
} else if (auto arr = std::get_if<cif_detail::bcif_array>(&m_array)) {
if (pos >= arr->m_arr.size())
return d;
if (auto str_ptr = std::get_if<std::string>(&arr->m_arr[pos])) {
return str_ptr->c_str();
}
m_internal_str_cache = std::visit([](auto&& arg) -> std::string {
if constexpr (std::is_same_v<std::decay_t<decltype(arg)>,
std::string>) {
return arg;
} else {
return std::to_string(arg);
}
}, arr->m_arr[pos]);
return m_internal_str_cache.c_str();
}
return d;
}

/// Alias for as<int>()
Expand All @@ -210,17 +314,33 @@ class cif_array {
/**
* CIF data block. The viewed data is owned by the cif_file.
*/
class cif_data {
friend class cif_file;

// data_<code>
const char* m_code = nullptr;
namespace cif_detail {
/**
 * Data block storage whose code and dictionary keys are non-owning views
 * (`const char*` / `zstring_view`).
 * NOTE(review): presumably these point into the buffer held by the owning
 * cif_file -- confirm against the parser.
 */
struct cif_str_data {
// data_<code>
const char* m_code = nullptr;

// Arrays of this block, keyed by a non-owning string view
std::map<_cif_detail::zstring_view, cif_array> m_dict;
// Arrays keyed by an owning string
// NOTE(review): purpose relative to m_dict is not visible here -- confirm
std::map<std::string, cif_array> m_dict_str;
// Save frames of this block, keyed by a non-owning string view
std::map<_cif_detail::zstring_view, cif_detail::cif_str_data> m_saveframes;

// only needed for freeing
std::vector<std::unique_ptr<cif_loop>> m_loops;
};

// Nested string-keyed maps of decoded elements. Judging by the alias names:
// column name -> elements, category name -> columns, data block -> categories.
using ColumnMap = std::map<std::string, std::vector<CifArrayElement>>;
using CategoryMap = std::map<std::string, ColumnMap>;
using DataBlockMap = std::map<std::string, CategoryMap>;
/**
 * Data block storage which owns its code and keys as std::string.
 */
struct bcif_data {
// Block code (owned)
std::string m_code;
// Two-level lookup of arrays
// NOTE(review): presumably category name -> column name -> array; confirm
// against cif_data::get_arr
std::map<std::string, std::map<std::string, cif_array>> m_dict;
};
}

std::map<_cif_detail::zstring_view, cif_array> m_dict;
std::map<_cif_detail::zstring_view, cif_data> m_saveframes;
class cif_data {
friend class cif_file;

// only needed for freeing
std::vector<std::unique_ptr<cif_loop>> m_loops;
CIFData m_data;

// generic default value
static const cif_array* empty_array();
Expand All @@ -234,7 +354,7 @@ class cif_data {
cif_data& operator=(cif_data&&) = default;

/// Block code (never nullptr)
const char* code() const { return m_code ? m_code : ""; }
const char* code() const;

// Get a pointer to array or nullptr if not found
const cif_array* get_arr(const char* key) const;
Expand All @@ -253,7 +373,7 @@ class cif_data {
}

/// Get a pointer to a save frame or nullptr if not found
const cif_data* get_saveframe(const char* code) const;
const cif_detail::cif_str_data* get_saveframe(const char* code) const;
};

} // namespace pymol
Expand Down
56 changes: 53 additions & 3 deletions layer2/CifMoleculeReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ static bond_dict_t * get_global_components_bond_dict(PyMOLGlobals * G) {
return nullptr;
}

for (const auto& datablock : cif.datablocks()) {
for (const auto& [code, datablock] : cif.datablocks()) {
read_chem_comp_bond_dict(&datablock, bond_dict);
}
}
Expand Down Expand Up @@ -2264,7 +2264,7 @@ pymol::Result<ObjectMolecule*> ObjectMoleculeReadCifStr(PyMOLGlobals * G, Object
return pymol::make_error("Parsing CIF file failed: ", cif->m_error_msg);
}

for (const auto& datablock : cif->datablocks()) {
for (const auto& [code, datablock] : cif->datablocks()) {
ObjectMolecule * obj = ObjectMoleculeReadCifData(G, &datablock, discrete, quiet);

if (!obj) {
Expand Down Expand Up @@ -2330,7 +2330,7 @@ const bond_dict_t::mapped_type * bond_dict_t::get(PyMOLGlobals * G, const char *
return nullptr;
}

for (auto& item : cif.datablocks())
for (auto& [code, item] : cif.datablocks())
read_chem_comp_bond_dict(&item, *this);
}
}
Expand All @@ -2352,4 +2352,54 @@ const bond_dict_t::mapped_type * bond_dict_t::get(PyMOLGlobals * G, const char *
return nullptr;
}


///////////////////////////////////////

/**
 * Read a molecule from a BinaryCIF blob.
 *
 * Only the first data block which yields an object is loaded; a warning is
 * printed for every data block that is skipped before it because no object
 * could be read from it.
 *
 * @param G PyMOL globals
 * @param I must be nullptr (loading into an existing object is not supported)
 * @param bytes BinaryCIF blob
 * @param size blob size
 * @param frame unused
 * @param discrete passed on to ObjectMoleculeReadCifData
 * @param quiet passed on to ObjectMoleculeReadCifData
 * @param multiplex must not be positive (multiplexing is not supported)
 * @param zoom unused
 * @return the new object, nullptr if no data block yielded an object (or if
 * this build has no BinaryCIF support), or an error
 */
pymol::Result<ObjectMolecule*> ObjectMoleculeReadBCif(PyMOLGlobals* G,
    ObjectMolecule* I, const char* bytes, std::size_t size, int frame,
    int discrete, int quiet, int multiplex, int zoom)
{
#ifdef _PYMOL_NO_MSGPACKC
  PRINTFB(G, FB_ObjectMolecule, FB_Errors)
    " Error: This build has no BinaryCIF support.\n"
    " Please install/enable msgpack-c.\n"
  ENDFB(G);
  return nullptr;
#endif

  if (I) {
    return pymol::Error("loading BCIF into existing object not supported, "
                        "please use 'create' to append to an existing object.");
  }

  if (multiplex > 0) {
    return pymol::Error("loading BCIF with multiplex=1 not supported, please "
                        "use 'split_states' after loading the object.");
  }

  auto cif = std::make_shared<pymol::cif_file>();

  // Do not silently continue with whatever a failed parse left behind
  if (!cif->parse_bcif(bytes, size)) {
    return pymol::Error("Parsing BCIF file failed");
  }

  for (const auto& [code, datablock] : cif->datablocks()) {
    auto obj = ObjectMoleculeReadCifData(G, &datablock, discrete, quiet);
    if (!obj) {
      PRINTFB(G, FB_ObjectMolecule, FB_Warnings)
        " BCIF-Warning: no coordinates found in data_%s\n", datablock.code() ENDFB(G);
      continue;
    }

#ifndef _PYMOL_NOPY
    // we only provide access from the Python API so far
    if (SettingGet<bool>(G, cSetting_cif_keepinmemory)) {
      obj->m_cifdata = &datablock;
      obj->m_ciffile = cif;
    }
#endif

    // Multiplexing is not implemented for BCIF, so always hand the first
    // successfully read object to the caller. Previously, with a negative
    // `multiplex` and more than one data block, the loop carried on without
    // returning or releasing `obj`, leaking every object that was created.
    return obj;
  }

  return nullptr;
}

// vi:sw=2:ts=2:expandtab
Loading

0 comments on commit 6731589

Please sign in to comment.