Merge pull request #3 from molssi-seamm/dev

Improved fitting of curves
molssi-seamm · Jul 4, 2024 · dcdd84c · dcdd84c
2 parents 056f39e + 56a77c4
commit dcdd84c
Show file tree

Hide file tree

Showing 6 changed files with 170 additions and 59 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -1,6 +1,11 @@
 =======
 History
 =======
+2024.7.4 -- Improved fitting of curves
+    * Removed weighting of the fit by the stdev since it is too biased to the beginning
+    * Added control over the portion of the data to fit in order to avoid the initial
+      curvature and poor data towards the end.
+
 2024.6.3 -- Bugfix: handling of options for subflowchart
     * Fixed a bug where the options for the subflowchart were not being parsed
       correctly.

diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml
@@ -11,6 +11,7 @@ dependencies:
   # SEAMM
   - molsystem
   - numpy
+  - rdkit
   - scipy
   - seamm
   - seamm-util

diff --git a/diffusivity_step/analysis.py b/diffusivity_step/analysis.py
@@ -129,7 +129,7 @@ def create_helfand_moments(v, species, m=None):
     return Ms, M_errs
 
 
-def fit_helfand_moment(y, xs, sigma=None, start=1):
+def fit_helfand_moment(y, xs, sigma=None, start=0.1, end=0.9):
     """Find the best linear fit.
 
     Parameters
@@ -143,6 +143,12 @@ def fit_helfand_moment(y, xs, sigma=None, start=1):
     sigma : [float] or numpy.ndarray()
         Optional standard error of y
 
+    start : float
+        Fraction of vector to ignore at the beginning
+
+    end : float
+        The fraction of vector to end at
+
     Returns
     -------
     slope : float
@@ -154,33 +160,39 @@ def fit_helfand_moment(y, xs, sigma=None, start=1):
     ys : [float]
         The y values for the fit curve.
     """
-    dx = xs[1] - xs[0]
-
-    # We know the curves curve near the origin, so ignore the first part
-    i = int(start / dx)
-    if i > len(y):
-        i = int(1 / dx)
-
-    popt, pcov, infodict, msg, ierr = curve_fit(
-        axb,
-        xs[i:],
-        y[i:],
-        full_output=True,
-        sigma=sigma[i:],
-        absolute_sigma=True,
-    )
+    # We know the curves curve near the origin, so ignore the first part and last
+    n = len(y)
+    i = int(n * start)
+    j = int(n * end)
+
+    if sigma is None:
+        popt, pcov, infodict, msg, ierr = curve_fit(
+            axb,
+            xs[i:j],
+            y[i:j],
+            full_output=True,
+        )
+    else:
+        popt, pcov, infodict, msg, ierr = curve_fit(
+            axb,
+            xs[i:j],
+            y[i:j],
+            full_output=True,
+            sigma=sigma[i:j],
+            absolute_sigma=True,
+        )
     slope = float(popt[0])
     b = float(popt[1])
     err = float(np.sqrt(np.diag(pcov)[0]))
 
     ys = []
-    for x in xs[i:]:
+    for x in xs[i:j]:
         ys.append(axb(x, slope, b))
 
-    return slope, err, xs[i:], ys
+    return slope, err, xs[i:j], ys
 
 
-def fit_msd(y, xs, sigma=None, start=1):
+def fit_msd(y, xs, sigma=None, start=0.1, end=0.9):
     """Find the best linear fit to longest possible segment.
 
     Parameters
@@ -194,6 +206,12 @@ def fit_msd(y, xs, sigma=None, start=1):
     sigma : [float] or numpy.ndarray()
         Optional standard error of y
 
+    start : float
+        Fraction of vector to ignore at the beginning
+
+    end : float
+        The fraction of vector to end at
+
     Returns
     -------
     slope : float
@@ -205,30 +223,36 @@ def fit_msd(y, xs, sigma=None, start=1):
     ys : [float]
         The y values for the fit curve.
     """
-    dx = xs[1] - xs[0]
-
-    # We know the curves curve near the origin, so ignore the first part
-    i = int(start / dx)
-    if i > len(y):
-        i = int(1 / dx)
-
-    popt, pcov, infodict, msg, ierr = curve_fit(
-        axb,
-        xs[i:],
-        y[i:],
-        full_output=True,
-        sigma=sigma[i:],
-        absolute_sigma=True,
-    )
+    # We know the curves curve near the origin, so ignore the first part and last
+    n = len(y)
+    i = int(n * start)
+    j = int(n * end)
+
+    if sigma is None:
+        popt, pcov, infodict, msg, ierr = curve_fit(
+            axb,
+            xs[i:j],
+            y[i:j],
+            full_output=True,
+        )
+    else:
+        popt, pcov, infodict, msg, ierr = curve_fit(
+            axb,
+            xs[i:j],
+            y[i:j],
+            full_output=True,
+            sigma=sigma[i:],
+            absolute_sigma=True,
+        )
     slope = float(popt[0])
     b = float(popt[1])
     err = float(np.sqrt(np.diag(pcov)[0]))
 
     ys = []
-    for x in xs[i:]:
+    for x in xs[i:j]:
         ys.append(axb(x, slope, b))
 
-    return slope, err, xs[i:], ys
+    return slope, err, xs[i:j], ys
 
 
 def add_helfand_trace(

diff --git a/diffusivity_step/diffusivity.py b/diffusivity_step/diffusivity.py
@@ -201,7 +201,15 @@ def species(self):
             self._species = self.parse_molecules()
         return self._species
 
-    def analyze(self, indent="", P=None, style="full", run=None, **kwargs):
+    def analyze(
+        self,
+        indent="",
+        P=None,
+        style="full",
+        run=None,
+        weighted=False,
+        **kwargs,
+    ):
         """Do any analysis of the output from this step.
 
         Also print important results to the local step.out file using
@@ -211,6 +219,8 @@ def analyze(self, indent="", P=None, style="full", run=None, **kwargs):
         ----------
         indent: str
             An extra indentation for the output
+        weighted : bool
+            Whether to use the stderr to wieght the fit, default False
         """
         data = {}
 
@@ -263,13 +273,21 @@ def analyze(self, indent="", P=None, style="full", run=None, **kwargs):
                 # Convert units and remember the factor of 1/6 in the Einstein equation
                 factor = Q_("Å^2/ps").m_as("m^2/s") / 6
                 for i in range(nalpha):
-                    start = 1
-                    slope, err, xs, ys = fit_msd(
-                        self._msd[spec][:, i],
-                        ts,
-                        sigma=self._msd_err[spec][:, i],
-                        start=start,
-                    )
+                    if weighted:
+                        slope, err, xs, ys = fit_msd(
+                            self._msd[spec][:, i],
+                            ts,
+                            sigma=self._msd_err[spec][:, i],
+                            start=P["msd_fit_start"],
+                            end=P["msd_fit_end"],
+                        )
+                    else:
+                        slope, err, xs, ys = fit_msd(
+                            self._msd[spec][:, i],
+                            ts,
+                            start=P["msd_fit_start"],
+                            end=P["msd_fit_end"],
+                        )
                     d_coeff = slope * factor
                     err = err * factor
                     if self._scale is None:
@@ -370,13 +388,21 @@ def analyze(self, indent="", P=None, style="full", run=None, **kwargs):
                 # Fit the slopes
                 fit = []
                 for i in range(nalpha):
-                    start = 1
-                    slope, err, xs, ys = fit_msd(
-                        self._M[spec][:, i],
-                        ts,
-                        sigma=self._M_err[spec][:, i],
-                        start=start,
-                    )
+                    if weighted:
+                        slope, err, xs, ys = fit_msd(
+                            self._M[spec][:, i],
+                            ts,
+                            sigma=self._M_err[spec][:, i],
+                            start=P["helfand_fit_start"],
+                            end=P["helfand_fit_end"],
+                        )
+                    else:
+                        slope, err, xs, ys = fit_msd(
+                            self._M[spec][:, i],
+                            ts,
+                            start=P["helfand_fit_start"],
+                            end=P["helfand_fit_end"],
+                        )
                     d_coeff = slope
                     err = err
                     if self._scale is None:
@@ -585,15 +611,16 @@ def description_text(self, P=None, short=False):
                 f"Calculate the diffusivity using the {P['approach']} approach, "
                 f"averaging over {P['nruns']} runs.\n\n"
             )
-        if not short:
-            # The subflowchart
-            self.subflowchart.root_directory = self.flowchart.root_directory
 
-            # Make sure that the subflowchart has the executor
-            self.subflowchart.executor = self.flowchart.executor
+        # Make sure the subflowchart has the data from the parent flowchart
+        self.subflowchart.root_directory = self.flowchart.root_directory
+        self.subflowchart.executor = self.flowchart.executor
+        self.subflowchart.in_jobserver = self.subflowchart.in_jobserver
 
+        if not short:
             # Get the first real node
             node = self.subflowchart.get_node("1").next()
+            node.all_options = self.all_options
 
             while node is not None:
                 try:
@@ -640,6 +667,9 @@ def run(self):
 
         # Remember the configuration
         _, self._configuration = self.get_system_configuration()
+        # Reset parameters if called in e.g. loop
+        self._n_molecules = None
+        self._species = None
 
         # Decide what to do
         approach = P["approach"]
@@ -692,6 +722,11 @@ def run(self):
                 out_level = out_handler.level
                 out_handler.setLevel(printing.JOB)
 
+        # Make sure the subflowchart has the data from the parent flowchart
+        self.subflowchart.root_directory = self.flowchart.root_directory
+        self.subflowchart.executor = self.flowchart.executor
+        self.subflowchart.in_jobserver = self.subflowchart.in_jobserver
+
         # Get the first real node
         first_node = self.subflowchart.get_node("1").next()
 

diff --git a/diffusivity_step/diffusivity_parameters.py b/diffusivity_step/diffusivity_parameters.py
@@ -74,7 +74,7 @@ class DiffusivityParameters(seamm.Parameters):
 
     parameters = {
         "approach": {
-            "default": "Mean Square Displacement (MSD)",
+            "default": "both",
             "kind": "enum",
             "default_units": "",
             "enumeration": (
@@ -87,14 +87,50 @@ class DiffusivityParameters(seamm.Parameters):
             "help_text": "The approach or method for determining the diffusivity.",
         },
         "nruns": {
-            "default": "20",
+            "default": "1",
             "kind": "integer",
             "default_units": "",
             "enumeration": tuple(),
             "format_string": "",
             "description": "Number of runs to average:",
             "help_text": "The number for separate runs to average.",
         },
+        "msd_fit_start": {
+            "default": "0.1",
+            "kind": "float",
+            "default_units": "",
+            "enumeration": tuple(),
+            "format_string": "",
+            "description": "Start point of MSD fit (0.0..1.0):",
+            "help_text": "Starting point of fit the MSD this far into the data.",
+        },
+        "msd_fit_end": {
+            "default": "0.8",
+            "kind": "float",
+            "default_units": "",
+            "enumeration": tuple(),
+            "format_string": "",
+            "description": "End point of MSD fit (0.0..1.0):",
+            "help_text": "End point of fit the MSD this far into the data.",
+        },
+        "helfand_fit_start": {
+            "default": "0.2",
+            "kind": "float",
+            "default_units": "",
+            "enumeration": tuple(),
+            "format_string": "",
+            "description": "Start point of Helfand fit (0.0..1.0):",
+            "help_text": "Starting point of Helfand fit this far into the data.",
+        },
+        "helfand_fit_end": {
+            "default": "0.95",
+            "kind": "float",
+            "default_units": "",
+            "enumeration": tuple(),
+            "format_string": "",
+            "description": "End point of Helfand fit (0.0..1.0):",
+            "help_text": "End point of Helfnad fit this far into the data.",
+        },
         "errors": {
             "default": "continue to next run",
             "kind": "string",

diff --git a/diffusivity_step/tk_diffusivity.py b/diffusivity_step/tk_diffusivity.py
@@ -210,7 +210,7 @@ def reset_diffusivity_frame(self, widget=None):
         """Layout the widgets in the diffusivity frame
         as needed for the current state"""
 
-        # approach = self["approach"].get()
+        approach = self["approach"].get()
 
         frame = self["diffusivity frame"]
         for slave in frame.grid_slaves():
@@ -224,6 +224,16 @@ def reset_diffusivity_frame(self, widget=None):
             self[key].grid(row=row, column=0, columnspan=2, sticky=tk.W)
             widgets.append(self[key])
             row += 1
+        if approach == "both" or "MSD" in approach:
+            for key in ("msd_fit_start", "msd_fit_end"):
+                self[key].grid(row=row, column=0, columnspan=2, sticky=tk.W)
+                widgets.append(self[key])
+                row += 1
+        if approach == "both" or "Helfand" in approach:
+            for key in ("helfand_fit_start", "helfand_fit_end"):
+                self[key].grid(row=row, column=0, columnspan=2, sticky=tk.W)
+                widgets.append(self[key])
+                row += 1
 
         sw.align_labels(widgets, sticky=tk.E)
         frame.columnconfigure(0, minsize=50)