Skip to content
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
240 changes: 240 additions & 0 deletions ipsuite/data_loading/add_data_gromacs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
import typing
import warnings
from pathlib import Path

import h5py
import MDAnalysis as mda
import numpy as np
import tqdm
import znh5md
import zntrack
from ase import Atoms
from ase.calculators.singlepoint import SinglePointCalculator
from MDAnalysis.auxiliary.EDR import EDRReader

_TYPE_TO_ELEMENT = {
"CL": "Cl",
"NA": "Na",
"MG": "Mg",
"ZN": "Zn",
"FE": "Fe",
"CA": "Ca",
"MN": "Mn",
"CU": "Cu",
"LI": "Li",
"AL": "Al",
"SI": "Si",
"BR": "Br",
"SE": "Se",
}


def _get_symbols(u: mda.Universe) -> list[str]:
"""Extract element symbols from a Universe, trying multiple strategies."""
# 1. Use elements attribute if available
try:
return list(u.atoms.elements)
except (mda.exceptions.NoDataError, AttributeError):
pass

# 2. Use atom types (usually cleaner than names for CHARMM-GUI)
types = u.atoms.types
symbols = []
for t in types:
t_upper = t.upper()
if t_upper in _TYPE_TO_ELEMENT:
symbols.append(_TYPE_TO_ELEMENT[t_upper])
elif len(t) <= 2 and t[0].isalpha():
# Capitalize properly: first letter upper, rest lower
symbols.append(t[0].upper() + t[1:].lower() if len(t) > 1 else t.upper())
else:
# Last resort: take leading alphabetic characters from atom name
symbols.append(t[0].upper())
return symbols
Comment thread
coderabbitai[bot] marked this conversation as resolved.


def gmx_to_ase(
topology: str,
trajectory: str | None = None,
edr: str | None = None,
start: int | None = None,
stop: int | None = None,
step: int | None = None,
) -> list[Atoms]:
"""Convert a GROMACS trajectory to a list of ASE Atoms objects.

Extracts all available information: positions, velocities, forces,
and (via the .edr file) energies and stress.

Parameters
----------
topology : str
Path to a GROMACS topology/structure file (.gro, .tpr).
trajectory : str | None
Path to a trajectory file (.xtc, .trr). If None, only the single
structure from the topology file is returned.
edr : str | None
Path to a GROMACS energy file (.edr). If given, per-frame energies
and stress tensors are attached via SinglePointCalculator.
start, stop, step : int | None
Slice parameters for selecting a subset of frames.

Returns
-------
list[Atoms]
One ASE Atoms object per frame. Each Atoms has:
- positions (always)
- cell and pbc (always)
- velocities (if present in trajectory)
- forces (if present in trajectory, e.g. .trr)
- calculator with energy/stress/forces (if .edr provided or forces
present), plus all EDR terms stored in calc.results
"""
if trajectory is not None:
u = mda.Universe(topology, trajectory)
else:
u = mda.Universe(topology)

symbols = _get_symbols(u)

# Load EDR data if provided
edr_data = None
if edr is not None:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
reader = EDRReader(edr)
edr_all = reader.get_data(list(reader.terms))
edr_times = edr_all.pop("Time")
edr_data = {term: values for term, values in edr_all.items()}
edr_terms = list(edr_data.keys())

frames = []
for ts in tqdm.tqdm(u.trajectory[start:stop:step]):
positions = ts.positions.copy()
box = ts.dimensions

atoms = Atoms(symbols=symbols, positions=positions, pbc=True)

if box is not None and all(box[:3] > 0):
atoms.set_cell(box, scale_atoms=False)
Comment on lines +154 to +160

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

pbc=True is hardcoded even when there is no valid cell.

Atoms are created with pbc=True before checking whether box is valid. For non-periodic systems (vacuum, or dimensions returning zeros), this produces inconsistent state: PBC enabled with no cell. Tie pbc to whether the cell was actually set.

Proposed fix
-        atoms = Atoms(symbols=symbols, positions=positions, pbc=True)
-
-        if box is not None and all(box[:3] > 0):
-            atoms.set_cell(box, scale_atoms=False)
+        has_cell = box is not None and all(box[:3] > 0)
+        atoms = Atoms(symbols=symbols, positions=positions, pbc=has_cell)
+        if has_cell:
+            atoms.set_cell(box, scale_atoms=False)
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@ipsuite/data_loading/add_data_gromacs.py` around lines 154 - 160, Atoms are
being constructed with pbc=True regardless of whether a valid cell exists
(ts.dimensions), causing inconsistent state; change the flow so you only enable
periodic boundary conditions when a valid box is detected (use ts.dimensions ->
box and the same all(box[:3] > 0) check) — construct Atoms with pbc=False (or
omit pbc) initially and then if box is valid call atoms.set_cell(box,
scale_atoms=False) and set atoms.pbc = True (or construct with pbc=True at that
point) so pbc reflects whether the cell was actually set.


# Velocities (e.g. from .gro or .trr)
if ts.has_velocities:
# MDAnalysis: Å/ps, ASE: Å/fs -> divide by 1000
atoms.set_velocities(ts.velocities / 1000.0)
Comment on lines +162 to +165

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical | ⚡ Quick win

🧩 Analysis chain

🌐 Web query:

ASE Atoms set_velocities expected units internal time unit ase.units.fs

💡 Result:

The ASE Atoms.set_velocities method expects velocities in ASE's internal units: length per time, where length is Ångström (Å = 1.0) and time is Å * sqrt(u / eV) ≈ 10.18 fs (since 1 fs ≈ 0.0982 in internal time units).[1][2] Velocities are dimensionless in the base system but interpreted as v_internal = Δposition / Δtime, with positions in Å and time derived from ase.units (eV=1, Å=1, u=1).[3][4][5] Kinetic energy is computed as sum(0.5 * m * v^2) in eV, confirming v has units sqrt(2*E/m) = sqrt(eV / u) / Å = 1 / time_unit.[6] To set velocities corresponding to a physical speed in fs^-1 (e.g., from VASP's Å/fs), scale by the time unit conversion: v_ase = v_physical * (1 fs / ase.units.fs) ≈ v_physical * 10.18, since ASE time unit ≈ 10.18 fs.[7][1] For example, 1 Å/fs in VASP requires ≈10.18 in ASE units to match kinetic energy.[7]

Citations:


🏁 Script executed:

# Find project configuration files to check ASE version
find . -maxdepth 2 -type f \( -name "setup.py" -o -name "setup.cfg" -o -name "pyproject.toml" -o -name "requirements.txt" -o -name "requirements*.txt" \) | head -20

Repository: zincware/IPSuite

Length of output: 77


🏁 Script executed:

# Read the actual file to see current imports and code
head -50 ipsuite/data_loading/add_data_gromacs.py

Repository: zincware/IPSuite

Length of output: 1189


🏁 Script executed:

# Check the exact lines mentioned in the review
sed -n '155,175p' ipsuite/data_loading/add_data_gromacs.py

Repository: zincware/IPSuite

Length of output: 820


🏁 Script executed:

# Read pyproject.toml to find ASE version pinning
cat pyproject.toml | grep -A 20 -i "ase\|dependencies"

Repository: zincware/IPSuite

Length of output: 1917


🏁 Script executed:

# Search for other uses of ase.units to see patterns
rg "from ase.units import|ase\.units\." --type py -B 2 -A 2

Repository: zincware/IPSuite

Length of output: 942


🏁 Script executed:

# Look for other velocity or momentum related code
rg "velocit|momentum|set_velocit" --type py -B 2 -A 2 | head -100

Repository: zincware/IPSuite

Length of output: 5529


🏁 Script executed:

# Search for tests related to gromacs loading
find . -name "*test*gromacs*" -o -name "*gromacs*test*" | head -10

Repository: zincware/IPSuite

Length of output: 42


🏁 Script executed:

# Check if there are any test files for data_loading
find . -path "*/test*" -name "*add_data_gromacs*" -o -path "*/test*" -name "*gromacs*"

Repository: zincware/IPSuite

Length of output: 42


🏁 Script executed:

# Look for any issue references or TODOs related to velocity
rg "TODO|FIXME|BUG|velocity.*unit|unit.*velocity" ipsuite/data_loading/add_data_gromacs.py

Repository: zincware/IPSuite

Length of output: 42


🏁 Script executed:

# Check what ase.units.fs actually equals by looking at ASE documentation or imports
python3 << 'EOF'
try:
    from ase import units
    print(f"ase.units.fs = {units.fs}")
    print(f"1 / ase.units.fs = {1 / units.fs}")
    print(f"ASE time unit in fs: {1/units.fs:.4f} fs")
except Exception as e:
    print(f"Error: {e}")
EOF

Repository: zincware/IPSuite

Length of output: 89


🏁 Script executed:

# Search for tests or examples using add_data_gromacs or trajectory loading
find . -type f -name "*.py" -exec grep -l "add_data_gromacs\|gromacs\|MDAnalysis.*trajectory" {} \; | head -20

Repository: zincware/IPSuite

Length of output: 180


🏁 Script executed:

# Look for any set_velocities calls to see if there are other patterns
rg "set_velocities|get_velocities" --type py -B 3 -A 3

Repository: zincware/IPSuite

Length of output: 1320


🏁 Script executed:

# Check if there are notebooks or examples demonstrating this
find . -name "*.ipynb" -o -name "*example*" -type f | grep -E "(gromacs|traj|velocity)" | head -10

Repository: zincware/IPSuite

Length of output: 42


🏁 Script executed:

# Look at the comment in the current code more carefully - what does it claim ASE expects?
sed -n '160,170p' ipsuite/data_loading/add_data_gromacs.py

Repository: zincware/IPSuite

Length of output: 533


Velocity unit conversion is incorrect — values are ~10× too small.

MDAnalysis stores velocities in Å/ps. The current conversion only changes this to Å/fs by dividing by 1000. However, ASE's internal velocity unit is Å / (ASE time unit) where the ASE time unit ≈ 10.18 fs (equivalently, ase.units.fs ≈ 0.0982). The current code is missing the conversion from Å/fs to ASE's internal units. This causes velocities to be ~10× too small, leading to kinetic energies that are ~100× too small (via 0.5·m·v²).

The proposed fix has the operation inverted—it multiplies by fs (a small number < 1), which would make the problem worse. The correct conversion should divide by fs:

Corrected fix
-from ase.units import kJ, mol
+from ase.units import fs, kJ, mol
@@
         if ts.has_velocities:
-            # MDAnalysis: Å/ps, ASE: Å/fs -> divide by 1000
-            atoms.set_velocities(ts.velocities / 1000.0)
+            # MDAnalysis: Å/ps -> ASE internal velocity (Å / ASE time unit)
+            atoms.set_velocities(ts.velocities / (1000.0 * fs))
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@ipsuite/data_loading/add_data_gromacs.py` around lines 162 - 165, The
velocity conversion is wrong: MDAnalysis gives velocities in Å/ps and you must
convert to ASE's internal time units by dividing, not multiplying. In the block
using ts.has_velocities and atoms.set_velocities(ts.velocities / 1000.0), change
the conversion to divide by both 1000.0 and ASE's fs constant (ase.units.fs) so
velocities = ts.velocities / (1000.0 * ase.units.fs); ensure ase.units (or from
ase import units as ase_units) is imported and used, and remove any inversion
that multiplies by fs.


# Forces and energies via SinglePointCalculator
forces = ts.forces.copy() if ts.has_forces else None
energy = None
stress = None
extra_results = {}

if edr_data is not None:
# Match EDR frame to trajectory time
idx = np.argmin(np.abs(edr_times - ts.time))
energy = float(edr_data["Potential"][idx]) # kJ/mol
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated

# Build Voigt stress from pressure tensor if available
try:
pxx = edr_data["Pres-XX"][idx]
pyy = edr_data["Pres-YY"][idx]
pzz = edr_data["Pres-ZZ"][idx]
pyz = edr_data["Pres-YZ"][idx]
pxz = edr_data["Pres-XZ"][idx]
pxy = edr_data["Pres-XY"][idx]
# GROMACS pressure in bar -> store as-is (not ASE native eV/ų)
stress = np.array([pxx, pyy, pzz, pyz, pxz, pxy])
Comment on lines +186 to +187

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Stress units mismatch: GROMACS pressure is in bar, ASE expects eV/ų.

The comment acknowledges the unit discrepancy, but storing stress in bar breaks ASE's stress conventions. Any code calling atoms.get_stress() will receive values in the wrong units, leading to incorrect virial/pressure calculations downstream.

Convert bar to eV/ų: 1 bar = 1e-4 GPa = 6.2415e-7 eV/ų (use ase.units for precision).

Proposed fix
+from ase import units
+
+# bar -> eV/ų
+BAR_TO_EV_ANG3 = 1.0 / (units.bar / (units.eV / units.Ang**3))
+
 ...
-                # GROMACS pressure in bar -> store as-is (not ASE native eV/ų)
-                stress = np.array([pxx, pyy, pzz, pyz, pxz, pxy])
+                # Convert GROMACS pressure (bar) to ASE stress (eV/ų)
+                stress = np.array([pxx, pyy, pzz, pyz, pxz, pxy]) * BAR_TO_EV_ANG3

except KeyError:
pass

# Store all EDR terms for this frame
for term in edr_terms:
extra_results[term] = float(edr_data[term][idx])
Comment on lines +191 to +193

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

extra_results keeps EDR values in raw GROMACS units.

All EDR terms (kJ/mol energies, bar pressures, K temperatures, …) are written verbatim into calc.results alongside ASE-unit energy/forces/stress. Mixing unit systems on the same calculator is a footgun for downstream consumers. At minimum, document this in the function docstring; preferably namespace these (e.g., a gmx_edr dict) so they cannot collide with ASE's standard keys (energy, forces, stress, dipole, …).

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@ipsuite/data_loading/add_data_gromacs.py` around lines 191 - 193, The code
currently writes raw GROMACS EDR values into extra_results (via the loop over
edr_terms/edr_data), which mixes units with ASE keys; change the storage so
values go under a namespaced dict (e.g., extra_results.setdefault('gmx_edr', {})
and then set extra_results['gmx_edr'][term] = float(edr_data[term][idx])) to
avoid colliding with ASE standard keys, and update the function docstring to
state that gmx_edr contains raw GROMACS units (kJ/mol, bar, K, …). Ensure you
check/initialize the 'gmx_edr' dict before the loop and do not modify existing
ASE keys like energy/forces/stress.


if energy is not None or forces is not None:
calc = SinglePointCalculator(
atoms,
energy=energy,
forces=forces,
stress=stress,
)
calc.results.update(extra_results)
atoms.calc = calc

frames.append(atoms)

return frames


class Gmx2Frames(zntrack.Node):
"""Convert GROMACS output files to ASE Atoms frames.

Reads topology, trajectory, and optionally energy (.edr) files
to produce a list of ASE Atoms with positions, velocities, forces,
energies, and stress where available.

Parameters
----------
topology : Path
Path to a GROMACS topology/structure file (.gro, .tpr).
trajectory : Path, optional
Path to a trajectory file (.xtc, .trr).
edr : Path, optional
Path to a GROMACS energy file (.edr).
start : int, optional
First frame index to read.
stop : int, optional
Last frame index (exclusive) to read.
step : int, optional
Step size for frame selection.

Examples
--------
>>> with project:
... md = ips.Gmx2Frames(
... topology="gromacs/system.gro",
... trajectory="gromacs/production.xtc",
... edr="gromacs/production.edr",
... start=1,
... )
"""
Comment on lines +232 to +241

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical | ⚡ Quick win

Docstring example uses a non-existent class name and is not testable.

The example references ips.Gmx2Frames(...), but the public symbol is AddDataGMX (also in __all__). The example would raise AttributeError. Per coding guidelines, the Node docstring must include a testable example using project and ips.

Proposed fix
     Examples
     --------
-    >>> with project:
-    ...     md = ips.Gmx2Frames(
-    ...         topology="gromacs/system.gro",
-    ...         trajectory="gromacs/production.xtc",
-    ...         edr="gromacs/production.edr",
-    ...         start=1,
-    ...     )
+    >>> project = ips.Project()
+    >>> with project:
+    ...     md = ips.AddDataGMX(
+    ...         topology="gromacs/system.gro",
+    ...         trajectory="gromacs/production.xtc",
+    ...         edr="gromacs/production.edr",
+    ...         start=1,
+    ...     )
+    >>> project.run()

As per coding guidelines: "Each Node's docstring should include a testable example using project and ips".

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@ipsuite/data_loading/add_data_gromacs.py` around lines 232 - 241, The
docstring example incorrectly instantiates ips.Gmx2Frames which doesn't exist;
change the example to use the real public class AddDataGMX and ensure it is
shown being used with project and ips (e.g. ips.AddDataGMX(...)) so it is
testable; update the example arguments to match AddDataGMX's signature
(topology, trajectory, edr, start) and keep the surrounding "with project:"
context so doctests run correctly, referencing the AddDataGMX symbol in the
docstring.


topology: Path = zntrack.deps_path()
trajectory: Path = zntrack.deps_path(None)
edr: Path = zntrack.deps_path(None)
start: int = zntrack.params(None)
stop: int = zntrack.params(None)
step: int = zntrack.params(None)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated

frames_path: Path = zntrack.outs_path(zntrack.nwd / "frames.h5")

def run(self):
Comment thread
MrJulEnergy marked this conversation as resolved.
Outdated
data = gmx_to_ase(
topology=str(self.topology),
trajectory=str(self.trajectory) if self.trajectory else None,
edr=str(self.edr) if self.edr else None,
start=self.start,
stop=self.stop,
step=self.step,
)
frame_io = znh5md.IO(self.frames_path)
frame_io.extend(data)

@property
def frames(self) -> typing.List[Atoms]:
with self.state.fs.open(self.frames_path, "rb") as f:
with h5py.File(f) as file:
return znh5md.IO(file_handle=file)[:]


if __name__ == "__main__":
# Example: load the production trajectory with energies
frames = gmx_to_ase(
"gromacs/system.gro",
"gromacs/production.xtc",
edr="gromacs/production.edr",
)
print(f"Loaded {len(frames)} frames, {len(frames[0])} atoms per frame")
print(f"Cell: {frames[0].cell.cellpar()}")
print(f"Potential energy (frame 0): {frames[0].get_potential_energy()} kJ/mol")
print(f"All EDR terms on frame 0: {list(frames[1].calc.results.keys())}")
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
Loading