Skip to content

Commit

Permalink
Adding more support for bz2 files, and cleaning up definition of rang…
Browse files Browse the repository at this point in the history
…es of structures.
  • Loading branch information
paulsaxe committed Aug 9, 2023
1 parent fbd7d14 commit c8c3189
Show file tree
Hide file tree
Showing 15 changed files with 2,114 additions and 126 deletions.
156 changes: 109 additions & 47 deletions read_structure_step/formats/cif/cif.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,17 @@
The cif/mmcif reader/writer
"""

import bz2
import gzip
import logging
from pathlib import Path
import time

from ..registries import register_format_checker
from ..registries import register_reader
from ..registries import set_format_metadata
from seamm_util.printing import FormattedText as __
from ...utils import parse_indices

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -56,7 +60,7 @@ def load_cif(
add_hydrogens=False,
system_db=None,
system=None,
indices="1:end",
indices="1-end",
subsequent_as_configurations=False,
system_name="from file",
configuration_name="sequential",
Expand Down Expand Up @@ -90,7 +94,7 @@ def load_cif(
system : System = None
The system to use if adding subsequent structures as configurations.
indices : str = "1:end"
indices : str = "1-end"
The generalized indices (slices, SMARTS, etc.) to select structures
from a file containing multiple structures.
Expand Down Expand Up @@ -123,21 +127,62 @@ def load_cif(
if isinstance(path, str):
path = Path(path)

path.expanduser().resolve()
path = path.expanduser().resolve()

# Get the information for progress output, if requested.
n_records = 0
with (
gzip.open(path, mode="rt")
if path.suffix == ".gz"
else bz2.open(path, mode="rt")
if path.suffix == ".bz2"
else open(path, "r")
) as fd:
for line in fd:
if line[0:5] == "data_":
n_records += 1
if printer is not None:
printer("")
printer(f" The CIF file contains {n_records} data blocks.")
last_percent = 0
t0 = time.time()
last_t = t0

# Get the indices to pick
indices = parse_indices(indices, n_records)
n_structures = len(indices)
if n_structures == 0:
return
stop = indices[-1]

configurations = []
record_no = 0
structure_no = 0
lines = []
in_block = False
block_name = ""
with open(path, "r") as fd:
with (
gzip.open(path, mode="rt")
if path.suffix == ".gz"
else bz2.open(path, mode="rt")
if path.suffix == ".bz2"
else open(path, "r")
) as fd:
for line in fd:
if line[0:5] == "data_":
logger.debug(f"Found block {line}")
if not in_block:
in_block = True
block_name = line[5:].strip()
else:
record_no += 1
if record_no > stop:
lines = []
break
if record_no not in indices:
lines = []
continue

structure_no += 1
if structure_no > 1:
if subsequent_as_configurations:
Expand Down Expand Up @@ -183,54 +228,71 @@ def load_cif(
else:
configuration.name = str(configuration_name)
logger.debug(f" added system {system_db.n_systems}: {block_name}")

if printer:
percent = int(100 * structure_no / n_structures)
if percent > last_percent:
t1 = time.time()
if t1 - last_t >= 60:
t = int(t1 - t0)
rate = structure_no / (t1 - t0)
t_left = int((n_structures - structure_no) / rate)
printer(
f"\t{structure_no:6} ({percent}%) structures read "
f"in {t} seconds. About {t_left} seconds remaining."
)
last_t = t1
last_percent = percent
block_name = line[5:].strip()
lines = []
lines.append(line)

if len(lines) > 0:
# The last block just ends at the end of the file
structure_no += 1
if structure_no > 1:
if subsequent_as_configurations:
configuration = system.create_configuration()
else:
system = system_db.create_system()
configuration = system.create_configuration()

text = configuration.from_cif_text("\n".join(lines))
logger.debug(f" added system {system_db.n_systems}: {block_name}")
if text != "":
printer("\n")
printer(__(text, indent=4 * " "))

configurations.append(configuration)

# Set the system name
if system_name is not None and system_name != "":
lower_name = str(system_name).lower()
if "from file" in lower_name:
system.name = block_name
elif "file name" in lower_name:
system.name = path.stem
elif "formula" in lower_name:
system.name = configuration.formula()[0]
elif "empirical formula" in lower_name:
system.name = configuration.formula()[1]
else:
system.name = str(system_name)

# And the configuration name
if configuration_name is not None and configuration_name != "":
lower_name = str(configuration_name).lower()
if "from file" in lower_name:
configuration.name = block_name
elif "file name" in lower_name:
configuration.name = path.stem
elif "formula" in lower_name:
configuration.name = configuration.formula()[0]
elif "empirical formula" in lower_name:
configuration.name = configuration.formula()[1]
else:
configuration.name = str(configuration_name)
record_no += 1
if record_no in indices:
structure_no += 1
if structure_no > 1:
if subsequent_as_configurations:
configuration = system.create_configuration()
else:
system = system_db.create_system()
configuration = system.create_configuration()

text = configuration.from_cif_text("\n".join(lines))
logger.debug(f" added system {system_db.n_systems}: {block_name}")
if text != "":
printer("\n")
printer(__(text, indent=4 * " "))

configurations.append(configuration)

# Set the system name
if system_name is not None and system_name != "":
lower_name = str(system_name).lower()
if "from file" in lower_name:
system.name = block_name
elif "file name" in lower_name:
system.name = path.stem
elif "formula" in lower_name:
system.name = configuration.formula()[0]
elif "empirical formula" in lower_name:
system.name = configuration.formula()[1]
else:
system.name = str(system_name)

# And the configuration name
if configuration_name is not None and configuration_name != "":
lower_name = str(configuration_name).lower()
if "from file" in lower_name:
configuration.name = block_name
elif "file name" in lower_name:
configuration.name = path.stem
elif "formula" in lower_name:
configuration.name = configuration.formula()[0]
elif "empirical formula" in lower_name:
configuration.name = configuration.formula()[1]
else:
configuration.name = str(configuration_name)

return configurations
18 changes: 17 additions & 1 deletion read_structure_step/formats/mop/obabel.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from openbabel import openbabel
from read_structure_step.formats.registries import register_reader
import seamm
from seamm_util import Q_
from .find_mopac import find_mopac

if "OpenBabel_version" not in globals():
Expand Down Expand Up @@ -239,6 +240,8 @@ def load_mop(
# Finally, the MOPAC test data usually has three comment lines to start, with a
# single number on the second line, which is the heat of formation calculated by
# MOPAC. If this format is found the HOF is captured.
kcal2kJ = Q_(1, "kcal").m_as("kJ")

run_mopac = False
keywords = []
description_lines = []
Expand Down Expand Up @@ -605,9 +608,10 @@ def load_mop(
key,
"float",
description="The reference energy from MOPAC",
units="kJ/mol",
noerror=True,
)
properties.put(key, energy)
properties.put(key, energy * kcal2kJ)

# Handle properties encoded in the description
if len(description_lines) == 2 and "=" in description_lines[1]:
Expand Down Expand Up @@ -651,7 +655,19 @@ def load_mop(
description=f"stderr for the {keyword}.",
noerror=True,
)
if (
"heat capacity" in keyword
or "enthalpy" in keyword
or "entropy" in keyword
):
stderr = float(stderr) * kcal2kJ
system_properties.put(new_keyword, stderr)
if (
"heat capacity" in keyword
or "enthalpy" in keyword
or "entropy" in keyword
):
value = float(value) * kcal2kJ
system_properties.put(keyword, value)
except Exception as e:
print(f"{e}: {key}")
Expand Down
Loading

0 comments on commit c8c3189

Please sign in to comment.