Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion ipsuite/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ from .configuration_selection import (
)

# Data
from .data_loading import AddData, AddDataH5MD
from .data_loading import AddData, AddDataGMX, AddDataH5MD

# Datasets
from .datasets import MD22Dataset
Expand Down Expand Up @@ -156,6 +156,7 @@ __all__ = [
# Data
"AddData",
"AddDataH5MD",
"AddDataGMX",
# Datasets
"MD22Dataset",
# Bootstrap
Expand Down
3 changes: 2 additions & 1 deletion ipsuite/data_loading/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""ipsuite data loading module."""

from ipsuite.data_loading.add_data_ase import AddData
from ipsuite.data_loading.add_data_gromacs import AddDataGMX
from ipsuite.data_loading.add_data_h5md import AddDataH5MD

__all__ = ["AddData", "AddDataH5MD"]
__all__ = ["AddData", "AddDataH5MD", "AddDataGMX"]
297 changes: 297 additions & 0 deletions ipsuite/data_loading/add_data_gromacs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
import logging
import typing
import warnings
from pathlib import Path

import h5py
import MDAnalysis as mda
import numpy as np
import tqdm
import znh5md
import zntrack
from ase import Atoms
from ase.calculators.singlepoint import SinglePointCalculator
from ase.units import kJ, mol
from MDAnalysis.auxiliary.EDR import EDRReader

logger = logging.getLogger(__name__)

_TYPE_TO_ELEMENT = {
"CL": "Cl",
"NA": "Na",
"MG": "Mg",
"ZN": "Zn",
"FE": "Fe",
"CA": "Ca",
"MN": "Mn",
"CU": "Cu",
"LI": "Li",
"AL": "Al",
"SI": "Si",
"BR": "Br",
"SE": "Se",
}


def _get_symbols(u: mda.Universe) -> list[str]:
"""
Produce a list of element symbols for the atoms in an MDAnalysis Universe by using
available per-atom metadata and sensible fallbacks.

Parameters:
u (mda.Universe): MDAnalysis Universe containing atoms to derive symbols for.

Returns:
list[str]: Element symbols (e.g., "C", "Cl", "Na") for each atom
in the Universe in atom order.
"""
# 1. Use elements attribute if available
try:
return list(u.atoms.elements)
except (mda.exceptions.NoDataError, AttributeError):
pass

# 2. Use atom types (usually cleaner than names for CHARMM-GUI)
types = u.atoms.types
symbols = []
for t in types:
t_upper = t.upper()
if t_upper in _TYPE_TO_ELEMENT:
symbols.append(_TYPE_TO_ELEMENT[t_upper])
elif len(t) <= 2 and t[0].isalpha():
# Capitalize properly: first letter upper, rest lower
symbols.append(t[0].upper() + t[1:].lower() if len(t) > 1 else t.upper())
else:
# Last resort: take leading alphabetic characters from atom name
symbols.append(t[0].upper())
return symbols
Comment thread
coderabbitai[bot] marked this conversation as resolved.


def _match_edr_frame(
edr_times: np.ndarray, traj_time: float, tolerance: float = 0.1
) -> int:
"""Find the EDR index closest to a trajectory time, warning on large gaps."""
# Use binary search to find insertion point
insert_idx = np.searchsorted(edr_times, traj_time)

# Determine nearest neighbor (left or right)
if insert_idx == 0:
idx = 0
elif insert_idx == len(edr_times):
idx = len(edr_times) - 1
else:
# Compare distances to left and right neighbors
left_diff = abs(traj_time - edr_times[insert_idx - 1])
right_diff = abs(edr_times[insert_idx] - traj_time)
idx = insert_idx - 1 if left_diff <= right_diff else insert_idx

time_diff = abs(edr_times[idx] - traj_time)
if time_diff > tolerance:
raise ValueError(
f"EDR time {edr_times[idx]:.3f} ps does not match trajectory time "
f"{traj_time:.3f} ps (diff={time_diff:.3f} ps, tolerance={tolerance:.3f} ps)"
)
return idx
Comment thread
coderabbitai[bot] marked this conversation as resolved.


def gmx_to_ase(
topology: str,
trajectory: str | None = None,
edr: str | None = None,
start: int | None = None,
stop: int | None = None,
step: int | None = None,
) -> list[Atoms]:
"""Convert a GROMACS trajectory to a list of ASE Atoms objects.

Extracts all available information: positions, velocities, forces,
and (via the .edr file) energies and stress.

Parameters
----------
topology : str
Path to a GROMACS topology/structure file (.gro, .tpr).
trajectory : str | None
Path to a trajectory file (.xtc, .trr). If None, only the single
structure from the topology file is returned.
edr : str | None
Path to a GROMACS energy file (.edr). If given, per-frame energies
and stress tensors are attached via SinglePointCalculator.
start, stop, step : int | None
Slice parameters for selecting a subset of frames.

Returns
-------
list[Atoms]
One ASE Atoms object per frame. Each Atoms has:
- positions (always)
- cell and pbc (always)
- velocities (if present in trajectory)
- forces (if present in trajectory, e.g. .trr)
- calculator with energy/stress/forces (if .edr provided or forces
present), plus all EDR terms stored in calc.results
"""
if trajectory is not None:
u = mda.Universe(topology, trajectory)
else:
u = mda.Universe(topology)

symbols = _get_symbols(u)

# Load EDR data if provided
edr_data = None
if edr is not None:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
reader = EDRReader(edr)
edr_all = reader.get_data(list(reader.terms))
edr_times = edr_all.pop("Time")
edr_data = dict(edr_all)
edr_terms = list(edr_data.keys())

frames = []
for ts in tqdm.tqdm(u.trajectory[start:stop:step]):
positions = ts.positions.copy()
box = ts.dimensions

atoms = Atoms(symbols=symbols, positions=positions, pbc=True)

if box is not None and all(box[:3] > 0):
atoms.set_cell(box, scale_atoms=False)
Comment on lines +154 to +160

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

pbc=True is hardcoded even when there is no valid cell.

Atoms are created with pbc=True before checking whether box is valid. For non-periodic systems (vacuum, or dimensions returning zeros), this produces inconsistent state: PBC enabled with no cell. Tie pbc to whether the cell was actually set.

Proposed fix
-        atoms = Atoms(symbols=symbols, positions=positions, pbc=True)
-
-        if box is not None and all(box[:3] > 0):
-            atoms.set_cell(box, scale_atoms=False)
+        has_cell = box is not None and all(box[:3] > 0)
+        atoms = Atoms(symbols=symbols, positions=positions, pbc=has_cell)
+        if has_cell:
+            atoms.set_cell(box, scale_atoms=False)
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@ipsuite/data_loading/add_data_gromacs.py` around lines 154 - 160, Atoms are
being constructed with pbc=True regardless of whether a valid cell exists
(ts.dimensions), causing inconsistent state; change the flow so you only enable
periodic boundary conditions when a valid box is detected (use ts.dimensions ->
box and the same all(box[:3] > 0) check) — construct Atoms with pbc=False (or
omit pbc) initially and then if box is valid call atoms.set_cell(box,
scale_atoms=False) and set atoms.pbc = True (or construct with pbc=True at that
point) so pbc reflects whether the cell was actually set.


# Velocities (e.g. from .gro or .trr)
if ts.has_velocities:
# MDAnalysis: Å/ps, ASE: Å/fs -> divide by 1000
atoms.set_velocities(ts.velocities / 1000.0)
Comment on lines +162 to +165

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical | ⚡ Quick win

🧩 Analysis chain

🌐 Web query:

ASE Atoms set_velocities expected units internal time unit ase.units.fs

💡 Result:

The ASE Atoms.set_velocities method expects velocities in ASE's internal units: length per time, where length is Ångström (Å = 1.0) and time is Å * sqrt(u / eV) ≈ 10.18 fs (since 1 fs ≈ 0.0982 in internal time units).[1][2] Velocities are dimensionless in the base system but interpreted as v_internal = Δposition / Δtime, with positions in Å and time derived from ase.units (eV=1, Å=1, u=1).[3][4][5] Kinetic energy is computed as sum(0.5 * m * v^2) in eV, confirming v has units sqrt(2*E/m) = sqrt(eV / u) / Å = 1 / time_unit.[6] To set velocities corresponding to a physical speed in fs^-1 (e.g., from VASP's Å/fs), scale by the time unit conversion: v_ase = v_physical * (1 fs / ase.units.fs) ≈ v_physical * 10.18, since ASE time unit ≈ 10.18 fs.[7][1] For example, 1 Å/fs in VASP requires ≈10.18 in ASE units to match kinetic energy.[7]

Citations:


🏁 Script executed:

# Find project configuration files to check ASE version
find . -maxdepth 2 -type f \( -name "setup.py" -o -name "setup.cfg" -o -name "pyproject.toml" -o -name "requirements.txt" -o -name "requirements*.txt" \) | head -20

Repository: zincware/IPSuite

Length of output: 77


🏁 Script executed:

# Read the actual file to see current imports and code
head -50 ipsuite/data_loading/add_data_gromacs.py

Repository: zincware/IPSuite

Length of output: 1189


🏁 Script executed:

# Check the exact lines mentioned in the review
sed -n '155,175p' ipsuite/data_loading/add_data_gromacs.py

Repository: zincware/IPSuite

Length of output: 820


🏁 Script executed:

# Read pyproject.toml to find ASE version pinning
cat pyproject.toml | grep -A 20 -i "ase\|dependencies"

Repository: zincware/IPSuite

Length of output: 1917


🏁 Script executed:

# Search for other uses of ase.units to see patterns
rg "from ase.units import|ase\.units\." --type py -B 2 -A 2

Repository: zincware/IPSuite

Length of output: 942


🏁 Script executed:

# Look for other velocity or momentum related code
rg "velocit|momentum|set_velocit" --type py -B 2 -A 2 | head -100

Repository: zincware/IPSuite

Length of output: 5529


🏁 Script executed:

# Search for tests related to gromacs loading
find . -name "*test*gromacs*" -o -name "*gromacs*test*" | head -10

Repository: zincware/IPSuite

Length of output: 42


🏁 Script executed:

# Check if there are any test files for data_loading
find . -path "*/test*" -name "*add_data_gromacs*" -o -path "*/test*" -name "*gromacs*"

Repository: zincware/IPSuite

Length of output: 42


🏁 Script executed:

# Look for any issue references or TODOs related to velocity
rg "TODO|FIXME|BUG|velocity.*unit|unit.*velocity" ipsuite/data_loading/add_data_gromacs.py

Repository: zincware/IPSuite

Length of output: 42


🏁 Script executed:

# Check what ase.units.fs actually equals by looking at ASE documentation or imports
python3 << 'EOF'
try:
    from ase import units
    print(f"ase.units.fs = {units.fs}")
    print(f"1 / ase.units.fs = {1 / units.fs}")
    print(f"ASE time unit in fs: {1/units.fs:.4f} fs")
except Exception as e:
    print(f"Error: {e}")
EOF

Repository: zincware/IPSuite

Length of output: 89


🏁 Script executed:

# Search for tests or examples using add_data_gromacs or trajectory loading
find . -type f -name "*.py" -exec grep -l "add_data_gromacs\|gromacs\|MDAnalysis.*trajectory" {} \; | head -20

Repository: zincware/IPSuite

Length of output: 180


🏁 Script executed:

# Look for any set_velocities calls to see if there are other patterns
rg "set_velocities|get_velocities" --type py -B 3 -A 3

Repository: zincware/IPSuite

Length of output: 1320


🏁 Script executed:

# Check if there are notebooks or examples demonstrating this
find . -name "*.ipynb" -o -name "*example*" -type f | grep -E "(gromacs|traj|velocity)" | head -10

Repository: zincware/IPSuite

Length of output: 42


🏁 Script executed:

# Look at the comment in the current code more carefully - what does it claim ASE expects?
sed -n '160,170p' ipsuite/data_loading/add_data_gromacs.py

Repository: zincware/IPSuite

Length of output: 533


Velocity unit conversion is incorrect — values are ~10× too small.

MDAnalysis stores velocities in Å/ps. The current conversion only changes this to Å/fs by dividing by 1000. However, ASE's internal velocity unit is Å / (ASE time unit) where the ASE time unit ≈ 10.18 fs (equivalently, ase.units.fs ≈ 0.0982). The current code is missing the conversion from Å/fs to ASE's internal units. This causes velocities to be ~10× too small, leading to kinetic energies that are ~100× too small (via 0.5·m·v²).

The proposed fix has the operation inverted—it multiplies by fs (a small number < 1), which would make the problem worse. The correct conversion should divide by fs:

Corrected fix
-from ase.units import kJ, mol
+from ase.units import fs, kJ, mol
@@
         if ts.has_velocities:
-            # MDAnalysis: Å/ps, ASE: Å/fs -> divide by 1000
-            atoms.set_velocities(ts.velocities / 1000.0)
+            # MDAnalysis: Å/ps -> ASE internal velocity (Å / ASE time unit)
+            atoms.set_velocities(ts.velocities / (1000.0 * fs))
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@ipsuite/data_loading/add_data_gromacs.py` around lines 162 - 165, The
velocity conversion is wrong: MDAnalysis gives velocities in Å/ps and you must
convert to ASE's internal time units by dividing, not multiplying. In the block
using ts.has_velocities and atoms.set_velocities(ts.velocities / 1000.0), change
the conversion to divide by both 1000.0 and ASE's fs constant (ase.units.fs) so
velocities = ts.velocities / (1000.0 * ase.units.fs); ensure ase.units (or from
ase import units as ase_units) is imported and used, and remove any inversion
that multiplies by fs.


# Forces and energies via SinglePointCalculator
# MDAnalysis forces are in kJ/(mol·Å), convert to ASE units (eV/Å)
forces = ts.forces.copy() * (kJ / mol) if ts.has_forces else None
energy = None
stress = None
extra_results = {}

if edr_data is not None:
idx = _match_edr_frame(edr_times, ts.time)
energy = float(edr_data["Potential"][idx]) * (kJ / mol) # convert to eV

# Build Voigt stress from pressure tensor if available
try:
pxx = edr_data["Pres-XX"][idx]
pyy = edr_data["Pres-YY"][idx]
pzz = edr_data["Pres-ZZ"][idx]
pyz = edr_data["Pres-YZ"][idx]
pxz = edr_data["Pres-XZ"][idx]
pxy = edr_data["Pres-XY"][idx]
# GROMACS pressure in bar -> store as-is (not ASE native eV/ų)
stress = np.array([pxx, pyy, pzz, pyz, pxz, pxy])
Comment on lines +186 to +187

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Stress units mismatch: GROMACS pressure is in bar, ASE expects eV/ų.

The comment acknowledges the unit discrepancy, but storing stress in bar breaks ASE's stress conventions. Any code calling atoms.get_stress() will receive values in the wrong units, leading to incorrect virial/pressure calculations downstream.

Convert bar to eV/ų: 1 bar = 1e-4 GPa = 6.2415e-7 eV/ų (use ase.units for precision).

Proposed fix
+from ase import units
+
+# bar -> eV/ų
+BAR_TO_EV_ANG3 = 1.0 / (units.bar / (units.eV / units.Ang**3))
+
 ...
-                # GROMACS pressure in bar -> store as-is (not ASE native eV/ų)
-                stress = np.array([pxx, pyy, pzz, pyz, pxz, pxy])
+                # Convert GROMACS pressure (bar) to ASE stress (eV/ų)
+                stress = np.array([pxx, pyy, pzz, pyz, pxz, pxy]) * BAR_TO_EV_ANG3

except KeyError:
pass

# Store all EDR terms for this frame
for term in edr_terms:
extra_results[term] = float(edr_data[term][idx])
Comment on lines +191 to +193

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

extra_results keeps EDR values in raw GROMACS units.

All EDR terms (kJ/mol energies, bar pressures, K temperatures, …) are written verbatim into calc.results alongside ASE-unit energy/forces/stress. Mixing unit systems on the same calculator is a footgun for downstream consumers. At minimum, document this in the function docstring; preferably namespace these (e.g., a gmx_edr dict) so they cannot collide with ASE's standard keys (energy, forces, stress, dipole, …).

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@ipsuite/data_loading/add_data_gromacs.py` around lines 191 - 193, The code
currently writes raw GROMACS EDR values into extra_results (via the loop over
edr_terms/edr_data), which mixes units with ASE keys; change the storage so
values go under a namespaced dict (e.g., extra_results.setdefault('gmx_edr', {})
and then set extra_results['gmx_edr'][term] = float(edr_data[term][idx])) to
avoid colliding with ASE standard keys, and update the function docstring to
state that gmx_edr contains raw GROMACS units (kJ/mol, bar, K, …). Ensure you
check/initialize the 'gmx_edr' dict before the loop and do not modify existing
ASE keys like energy/forces/stress.


if energy is not None or forces is not None:
calc = SinglePointCalculator(
atoms,
energy=energy,
forces=forces,
stress=stress,
)
calc.results.update(extra_results)
atoms.calc = calc

frames.append(atoms)

return frames


class AddDataGMX(zntrack.Node):
"""Convert GROMACS output files to ASE Atoms frames.

Reads topology, trajectory, and optionally energy (.edr) files
to produce a list of ASE Atoms with positions, velocities, forces,
energies, and stress where available.

Parameters
----------
topology : Path
Path to a GROMACS topology/structure file (.gro, .tpr).
trajectory : Path, optional
Path to a trajectory file (.xtc, .trr).
edr : Path, optional
Path to a GROMACS energy file (.edr).
start : int, optional
First frame index to read.
stop : int, optional
Last frame index (exclusive) to read.
step : int, optional
Step size for frame selection.

Examples
--------
>>> with project:
... md = ips.Gmx2Frames(
... topology="gromacs/system.gro",
... trajectory="gromacs/production.xtc",
... edr="gromacs/production.edr",
... start=1,
... )
"""
Comment on lines +232 to +241

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical | ⚡ Quick win

Docstring example uses a non-existent class name and is not testable.

The example references ips.Gmx2Frames(...), but the public symbol is AddDataGMX (also in __all__). The example would raise AttributeError. Per coding guidelines, the Node docstring must include a testable example using project and ips.

Proposed fix
     Examples
     --------
-    >>> with project:
-    ...     md = ips.Gmx2Frames(
-    ...         topology="gromacs/system.gro",
-    ...         trajectory="gromacs/production.xtc",
-    ...         edr="gromacs/production.edr",
-    ...         start=1,
-    ...     )
+    >>> project = ips.Project()
+    >>> with project:
+    ...     md = ips.AddDataGMX(
+    ...         topology="gromacs/system.gro",
+    ...         trajectory="gromacs/production.xtc",
+    ...         edr="gromacs/production.edr",
+    ...         start=1,
+    ...     )
+    >>> project.run()

As per coding guidelines: "Each Node's docstring should include a testable example using project and ips".

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@ipsuite/data_loading/add_data_gromacs.py` around lines 232 - 241, The
docstring example incorrectly instantiates ips.Gmx2Frames which doesn't exist;
change the example to use the real public class AddDataGMX and ensure it is
shown being used with project and ips (e.g. ips.AddDataGMX(...)) so it is
testable; update the example arguments to match AddDataGMX's signature
(topology, trajectory, edr, start) and keep the surrounding "with project:"
context so doctests run correctly, referencing the AddDataGMX symbol in the
docstring.


topology: Path = zntrack.deps_path()
trajectory: Path | None = zntrack.deps_path(None)
edr: Path | None = zntrack.deps_path(None)
start: int | None = zntrack.params(None)
stop: int | None = zntrack.params(None)
step: int | None = zntrack.params(None)

frames_path: Path = zntrack.outs_path(zntrack.nwd / "frames.h5")

def run(self) -> None:
"""
Convert the configured GROMACS inputs into ASE Atoms frames and persist them
to the node's HDF5 output at self.frames_path.

The node's topology, optional trajectory, optional EDR file, and slicing
parameters (start, stop, step) are used to produce the frames which are
written to the frames_path via znh5md.
"""
data = gmx_to_ase(
topology=str(self.topology),
trajectory=str(self.trajectory) if self.trajectory else None,
edr=str(self.edr) if self.edr else None,
start=self.start,
stop=self.stop,
step=self.step,
)
frame_io = znh5md.IO(self.frames_path)
frame_io.extend(data)

@property
def frames(self) -> typing.List[Atoms]:
"""
Return all ASE `Atoms` frames stored in the node's HDF5 frames file.

Returns:
typing.List[Atoms]: A list of ASE `Atoms` objects read
from the HDF5 file at `self.frames_path`.
"""
with self.state.fs.open(self.frames_path, "rb") as f:
with h5py.File(f) as file:
return znh5md.IO(file_handle=file)[:]


if __name__ == "__main__":
# Example: load the production trajectory with energies
frames = gmx_to_ase(
"gromacs/system.gro",
"gromacs/production.xtc",
edr="gromacs/production.edr",
)
print(f"Loaded {len(frames)} frames, {len(frames[0])} atoms per frame")
print(f"Cell: {frames[0].cell.cellpar()}")
print(f"Potential energy (frame 0): {frames[0].get_potential_energy()} eV")
if len(frames) >= 2:
print(f"All EDR terms on frame 1: {list(frames[1].calc.results.keys())}")
Loading