Merge pull request #67 from UDST/mnl_w_mct_df

New MergedChoiceTable feature: `from_df()` construction
UDST · Apr 14, 2020 · b173353
2 parents 54c936d + f1ec684
commit b173353
Show file tree

Hide file tree

Showing 8 changed files with 134 additions and 47 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -4,18 +4,13 @@ python:
   - "2.7"
   - "3.5"
   - "3.6"
-
-matrix:
-  include:
-    - python: "3.7"  # temp solution until travis supports python 3.7 more cleanly
-      dist: xenial
-      sudo: true
+  - "3.7"
+  - "3.8"
 
 install:
   - pip install .
   - pip install -r requirements-dev.txt
-  - # extra tests run if urbansim is present, but it can't install with python 3.7
-  - if [ "$TRAVIS_PYTHON_VERSION" != "3.7" ]; then pip install urbansim; fi
+  - pip install orca urbansim  # extra tests run if urbansim is present
   - pip list
   - pip show choicemodels
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,12 @@
 # ChoiceModels change log
-### 0.2.2dev0 (2019-04-23)
 
-- adds a function `choicemodels.tools.parallel_lottery_choices()` to run iterative lottery choice batches in parallel rather than seqeuentially.
+### 0.2.2.dev1 (2020-04-14)
+
+- adds a `MergedChoiceTable.from_df()` as an alternative constructor
+
+### 0.2.2.dev0 (2019-04-23)
+
+- adds a function `choicemodels.tools.parallel_lottery_choices()` to run iterative lottery choice batches in parallel rather than sequentially
 
 ### 0.2.1 (2019-01-30)
 

diff --git a/choicemodels/__init__.py b/choicemodels/__init__.py
@@ -3,4 +3,4 @@
 
 from .mnl import MultinomialLogit, MultinomialLogitResults
 
-version = __version__ = '0.2.2dev0'
+version = __version__ = '0.2.2.dev1'
diff --git a/choicemodels/tools/mergedchoicetable.py b/choicemodels/tools/mergedchoicetable.py
@@ -114,12 +114,6 @@ def __init__(self, observations, alternatives, chosen_alternatives=None,
                 raise ValueError("Cannot sample without replacement with sample_size {} "
                         "and n_alts {}".format(sample_size, alternatives.shape[0]))
 
-        if (observations.index.name == None):
-            observations.index.name = 'obs_id'
-
-        if (alternatives.index.name == None):
-            alternatives.index.name = 'alt_id'
-
         # TO DO - check that dfs have unique indexes
         # TO DO - check that chosen_alternatives correspond correctly to other dfs
         # TO DO - same with weights (could join onto other tables and then split off)
@@ -130,14 +124,25 @@ def __init__(self, observations, alternatives, chosen_alternatives=None,
             observations = observations.drop(chosen_alternatives.name, axis='columns')
             chosen_alternatives.name = '_' + alternatives.index.name  # avoids conflicts
 
-        # Check for duplicate column names
-        obs_cols = list(observations.columns) + list(observations.index.names)
-        alt_cols = list(alternatives.columns) + list(alternatives.index.names)
-        dupes = set(obs_cols) & set(alt_cols)
+        # Allow missing obs and alts, to support .from_df() constructor     
+        if (observations is not None):
+
+            # Provide default names for observation and alternatives id's
+
+            if (observations.index.name == None):
+                observations.index.name = 'obs_id'
+
+            if (alternatives.index.name == None):
+                alternatives.index.name = 'alt_id'
+
+            # Check for duplicate column names
+            obs_cols = list(observations.columns) + list(observations.index.names)
+            alt_cols = list(alternatives.columns) + list(alternatives.index.names)
+            dupes = set(obs_cols) & set(alt_cols)
 
-        if len(dupes) > 0:
-            raise ValueError("Both input tables contain column {}. Please ensure "
-                             "column names are unique before merging".format(dupes))
+            if len(dupes) > 0:
+                raise ValueError("Both input tables contain column {}. Please ensure "
+                                 "column names are unique before merging".format(dupes))
 
         # Normalize weights to a pd.Series
         if (weights is not None) & isinstance(weights, str):
@@ -172,17 +177,48 @@ def __init__(self, observations, alternatives, chosen_alternatives=None,
         self.weights_2d = weights_2d
 
         # Build choice table...
+        # Allow missing obs and alts, to support .from_df() constructor     
+        if (observations is not None):
 
-        if (len(observations) == 0) or (len(alternatives) == 0):
-            self._merged_table = pd.DataFrame()
+            if (len(observations) == 0) or (len(alternatives) == 0):
+                self._merged_table = pd.DataFrame()
 
-        elif (sample_size is None):
-            self._merged_table = self._build_table_without_sampling()
+            elif (sample_size is None):
+                self._merged_table = self._build_table_without_sampling()
 
-        else:
-            self._merged_table = self._build_table()
+            else:
+                self._merged_table = self._build_table()
 
 
+    @classmethod
+    def from_df(cls, df):
+        """
+        Create a MergedChoiceTable instance from a pre-generated DataFrame.
+
+        Each chooser's rows should be contiguous. If applicable, the chosen alternative
+        should be listed first. This ordering is used by MergedChoiceTable.to_frame(),
+        and appears to be an undocumented requirement of the legacy MNL code.
+
+        Parameters
+        ----------
+        df : pandas.DataFrame
+            Table with a two-level MultiIndex where the first level corresponds to the
+            index of the observations and the second to the index of the alternatives.
+            May include a binary column named 'chosen' indicating observed choices.
+        
+        Returns
+        -------
+        MergedChoiceTable
+
+        """
+        obj = cls(observations = None, alternatives = None)
+        obj._merged_table = df
+
+        # TO DO: sort the dataframe so that rows are automatically in a consistent order
+
+        return obj
+
+
     def _merge_interaction_terms(self, df):
         """
         Merges interaction terms (if they exist) onto the input DataFrame. 
@@ -436,7 +472,7 @@ def observation_id_col(self):
         str
 
         """
-        return self.observations.index.name
+        return self._merged_table.index.names[0]
 
 
     @property
@@ -450,7 +486,7 @@ def alternative_id_col(self):
         str
 
         """
-        return self.alternatives.index.name
+        return self._merged_table.index.names[1]
 
 
     @property
@@ -464,7 +500,7 @@ def choice_col(self):
         str or None
 
         """
-        if (self.chosen_alternatives is not None):
+        if ('chosen' in self._merged_table.columns):
             return 'chosen'
 
         else:

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -8,7 +8,7 @@ ChoiceModels
 
 ChoiceModels is a Python library for discrete choice modeling, with utilities for sampling, simulation, and other ancillary tasks. It's part of the `Urban Data Science Toolkit <https://docs.udst.org>`__ (UDST).
 
-v0.2.2dev0, released April 23, 2019
+v0.2.2.dev1, released April 14, 2020
 
 
 Contents

diff --git a/requirements.txt b/requirements.txt
diff --git a/setup.py b/setup.py
@@ -4,13 +4,9 @@
 with open('README.md', 'r') as f:
     long_description = f.read()
 
-with open('requirements.txt') as f:
-    install_requires = f.readlines()
-install_requires = [item.strip() for item in install_requires]
-
 setup(
     name='choicemodels',
-    version='0.2.2dev0',
+    version='0.2.2.dev1',
     description='Tools for discrete choice estimation',
     long_description=long_description,
     author='UDST',
@@ -23,8 +19,18 @@
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
         'License :: OSI Approved :: BSD License'
     ],
     packages=['choicemodels', 'choicemodels.tools'],
-    install_requires=install_requires
+    install_requires=[
+        'numpy >= 1.14',
+        'pandas >= 0.23',
+        'patsy >= 0.5',
+        'pylogit >= 0.2.2',
+        'scipy >= 1.0',
+        'statsmodels >= 0.8, <0.11; python_version <"3.6"',
+        'statsmodels >= 0.8; python_version >="3.6"'
+    ]
 )
diff --git a/tests/test_mct.py b/tests/test_mct.py
@@ -228,3 +228,55 @@ def test_join_key_name_conflict(obs, alts):
     MergedChoiceTable(obs, alts, chosen_alternatives=alts.index.name)
 
 
+def test_obs_id_property(obs, alts):
+    """
+    Observation id should be available for a merged table.
+    
+    """
+    mct = choicemodels.tools.MergedChoiceTable(obs, alts, 
+                                 sample_size = 2,
+                                 chosen_alternatives = 'choice')
+
+    assert(mct.observation_id_col == 'oid')
+
+
+def test_alt_id_property(obs, alts):
+    """
+    Alternative id should be available for a merged table.
+    
+    """
+    mct = choicemodels.tools.MergedChoiceTable(obs, alts, 
+                                 sample_size = 2,
+                                 chosen_alternatives = 'choice')
+
+    assert(mct.alternative_id_col == 'aid')
+
+
+def test_choice_col_property(obs, alts):
+    """
+    Choice column property should be present if applicable, or None.
+    
+    """
+    mct = choicemodels.tools.MergedChoiceTable(obs, alts, 
+                                 sample_size = 2,
+                                 chosen_alternatives = 'choice')
+    assert(mct.choice_col == 'chosen')
+
+    mct = choicemodels.tools.MergedChoiceTable(obs, alts, 
+                                 sample_size = 2)
+    assert(mct.choice_col == None)
+
+
+def test_from_df(obs, alts):
+    """
+    MCT creation from a dataframe should work smoothly.
+    
+    """
+    df = choicemodels.tools.MergedChoiceTable(obs, alts, 
+                                 sample_size = 2,
+                                 chosen_alternatives = 'choice').to_frame()
+
+    mct = choicemodels.tools.MergedChoiceTable.from_df(df)
+
+    assert(df.equals(mct.to_frame()))
+
diff --git a/choicemodels/__init__.py b/choicemodels/__init__.py
@@ -3,4 +3,4 @@
 
 from .mnl import MultinomialLogit, MultinomialLogitResults
 
-version = __version__ = '0.2.2dev0'
+version = __version__ = '0.2.2.dev1'