cguccione · cameronmartino · Oct 26, 2021
diff --git a/README.md b/README.md
@@ -1 +1,5 @@
 # NeutralEvolutionModeling
+
+## install
+
+pip install -e .
diff --git a/scripts/NeuFit_Pipeline.ipynb → ipynb/NeuFit_Pipeline.ipynb b/scripts/NeuFit_Pipeline.ipynb → ipynb/NeuFit_Pipeline.ipynb
diff --git a/.../esophagus_cancer_2021-09-12_21:01:35.txt → .../esophagus_cancer_2021-09-12_21:01:35.txt b/.../esophagus_cancer_2021-09-12_21:01:35.txt → .../esophagus_cancer_2021-09-12_21:01:35.txt
diff --git a/...er_2021-09-12_21:01:35_FullNonNeutral.csv → ...er_2021-09-12_21:01:35_FullNonNeutral.csv b/...er_2021-09-12_21:01:35_FullNonNeutral.csv → ...er_2021-09-12_21:01:35_FullNonNeutral.csv
diff --git a/...er_2021-09-12_21:01:35_NeutralFitPlot.png → ...er_2021-09-12_21:01:35_NeutralFitPlot.png b/...er_2021-09-12_21:01:35_NeutralFitPlot.png → ...er_2021-09-12_21:01:35_NeutralFitPlot.png
diff --git a/...5_NeutralFitPlot_withNonNeutralColors.png → ...5_NeutralFitPlot_withNonNeutralColors.png b/...5_NeutralFitPlot_withNonNeutralColors.png → ...5_NeutralFitPlot_withNonNeutralColors.png
diff --git a/...21-09-12_21:01:35_NonNeutral_Outliers.csv → ...21-09-12_21:01:35_NonNeutral_Outliers.csv b/...21-09-12_21:01:35_NonNeutral_Outliers.csv → ...21-09-12_21:01:35_NonNeutral_Outliers.csv
diff --git a/.../esophagus_normal_2021-09-12_20:54:39.txt → .../esophagus_normal_2021-09-12_20:54:39.txt b/.../esophagus_normal_2021-09-12_20:54:39.txt → .../esophagus_normal_2021-09-12_20:54:39.txt
diff --git a/...al_2021-09-12_20:54:39_FullNonNeutral.csv → ...al_2021-09-12_20:54:39_FullNonNeutral.csv b/...al_2021-09-12_20:54:39_FullNonNeutral.csv → ...al_2021-09-12_20:54:39_FullNonNeutral.csv
diff --git a/...al_2021-09-12_20:54:39_NeutralFitPlot.png → ...al_2021-09-12_20:54:39_NeutralFitPlot.png b/...al_2021-09-12_20:54:39_NeutralFitPlot.png → ...al_2021-09-12_20:54:39_NeutralFitPlot.png
diff --git a/...9_NeutralFitPlot_withNonNeutralColors.png → ...9_NeutralFitPlot_withNonNeutralColors.png b/...9_NeutralFitPlot_withNonNeutralColors.png → ...9_NeutralFitPlot_withNonNeutralColors.png
diff --git a/...21-09-12_20:54:39_NonNeutral_Outliers.csv → ...21-09-12_20:54:39_NonNeutral_Outliers.csv b/...21-09-12_20:54:39_NonNeutral_Outliers.csv → ...21-09-12_20:54:39_NonNeutral_Outliers.csv
diff --git a/...r/headNeck_cancer_2021-09-12_20:57:10.txt → ...r/headNeck_cancer_2021-09-12_20:57:10.txt b/...r/headNeck_cancer_2021-09-12_20:57:10.txt → ...r/headNeck_cancer_2021-09-12_20:57:10.txt
diff --git a/...er_2021-09-12_20:57:10_FullNonNeutral.csv → ...er_2021-09-12_20:57:10_FullNonNeutral.csv b/...er_2021-09-12_20:57:10_FullNonNeutral.csv → ...er_2021-09-12_20:57:10_FullNonNeutral.csv
diff --git a/...er_2021-09-12_20:57:10_NeutralFitPlot.png → ...er_2021-09-12_20:57:10_NeutralFitPlot.png b/...er_2021-09-12_20:57:10_NeutralFitPlot.png → ...er_2021-09-12_20:57:10_NeutralFitPlot.png
diff --git a/...0_NeutralFitPlot_withNonNeutralColors.png → ...0_NeutralFitPlot_withNonNeutralColors.png b/...0_NeutralFitPlot_withNonNeutralColors.png → ...0_NeutralFitPlot_withNonNeutralColors.png
diff --git a/...21-09-12_20:57:10_NonNeutral_Outliers.csv → ...21-09-12_20:57:10_NonNeutral_Outliers.csv b/...21-09-12_20:57:10_NonNeutral_Outliers.csv → ...21-09-12_20:57:10_NonNeutral_Outliers.csv
diff --git a/...l/headNeck_normal_2021-09-12_21:02:45.txt → ...l/headNeck_normal_2021-09-12_21:02:45.txt b/...l/headNeck_normal_2021-09-12_21:02:45.txt → ...l/headNeck_normal_2021-09-12_21:02:45.txt
diff --git a/...al_2021-09-12_21:02:45_FullNonNeutral.csv → ...al_2021-09-12_21:02:45_FullNonNeutral.csv b/...al_2021-09-12_21:02:45_FullNonNeutral.csv → ...al_2021-09-12_21:02:45_FullNonNeutral.csv
diff --git a/...al_2021-09-12_21:02:45_NeutralFitPlot.png → ...al_2021-09-12_21:02:45_NeutralFitPlot.png b/...al_2021-09-12_21:02:45_NeutralFitPlot.png → ...al_2021-09-12_21:02:45_NeutralFitPlot.png
diff --git a/...5_NeutralFitPlot_withNonNeutralColors.png → ...5_NeutralFitPlot_withNonNeutralColors.png b/...5_NeutralFitPlot_withNonNeutralColors.png → ...5_NeutralFitPlot_withNonNeutralColors.png
diff --git a/...21-09-12_21:02:45_NonNeutral_Outliers.csv → ...21-09-12_21:02:45_NonNeutral_Outliers.csv b/...21-09-12_21:02:45_NonNeutral_Outliers.csv → ...21-09-12_21:02:45_NonNeutral_Outliers.csv
diff --git a/...combined/combined_2021-09-11_01:05:08.txt → ...combined/combined_2021-09-11_01:05:08.txt b/...combined/combined_2021-09-11_01:05:08.txt → ...combined/combined_2021-09-11_01:05:08.txt
diff --git a/...ed_2021-09-11_01:05:08_FullNonNeutral.csv → ...ed_2021-09-11_01:05:08_FullNonNeutral.csv b/...ed_2021-09-11_01:05:08_FullNonNeutral.csv → ...ed_2021-09-11_01:05:08_FullNonNeutral.csv
diff --git a/...ed_2021-09-11_01:05:08_NeutralFitPlot.png → ...ed_2021-09-11_01:05:08_NeutralFitPlot.png b/...ed_2021-09-11_01:05:08_NeutralFitPlot.png → ...ed_2021-09-11_01:05:08_NeutralFitPlot.png
diff --git a/...8_NeutralFitPlot_withNonNeutralColors.png → ...8_NeutralFitPlot_withNonNeutralColors.png b/...8_NeutralFitPlot_withNonNeutralColors.png → ...8_NeutralFitPlot_withNonNeutralColors.png
diff --git a/...21-09-11_01:05:08_NonNeutral_Outliers.csv → ...21-09-11_01:05:08_NonNeutral_Outliers.csv b/...21-09-11_01:05:08_NonNeutral_Outliers.csv → ...21-09-11_01:05:08_NonNeutral_Outliers.csv
diff --git a/...rs/nonProgressors_2021-09-11_16:59:37.txt → ...rs/nonProgressors_2021-09-11_16:59:37.txt b/...rs/nonProgressors_2021-09-11_16:59:37.txt → ...rs/nonProgressors_2021-09-11_16:59:37.txt
diff --git a/...rs_2021-09-11_16:59:37_FullNonNeutral.csv → ...rs_2021-09-11_16:59:37_FullNonNeutral.csv b/...rs_2021-09-11_16:59:37_FullNonNeutral.csv → ...rs_2021-09-11_16:59:37_FullNonNeutral.csv
diff --git a/...rs_2021-09-11_16:59:37_NeutralFitPlot.png → ...rs_2021-09-11_16:59:37_NeutralFitPlot.png b/...rs_2021-09-11_16:59:37_NeutralFitPlot.png → ...rs_2021-09-11_16:59:37_NeutralFitPlot.png
diff --git a/...7_NeutralFitPlot_withNonNeutralColors.png → ...7_NeutralFitPlot_withNonNeutralColors.png b/...7_NeutralFitPlot_withNonNeutralColors.png → ...7_NeutralFitPlot_withNonNeutralColors.png
diff --git a/...21-09-11_16:59:37_NonNeutral_Outliers.csv → ...21-09-11_16:59:37_NonNeutral_Outliers.csv b/...21-09-11_16:59:37_NonNeutral_Outliers.csv → ...21-09-11_16:59:37_NonNeutral_Outliers.csv
diff --git a/.../nonProgressorsT1_2021-09-11_17:01:16.txt → .../nonProgressorsT1_2021-09-11_17:01:16.txt b/.../nonProgressorsT1_2021-09-11_17:01:16.txt → .../nonProgressorsT1_2021-09-11_17:01:16.txt
diff --git a/...T1_2021-09-11_17:01:16_FullNonNeutral.csv → ...T1_2021-09-11_17:01:16_FullNonNeutral.csv b/...T1_2021-09-11_17:01:16_FullNonNeutral.csv → ...T1_2021-09-11_17:01:16_FullNonNeutral.csv
diff --git a/...T1_2021-09-11_17:01:16_NeutralFitPlot.png → ...T1_2021-09-11_17:01:16_NeutralFitPlot.png b/...T1_2021-09-11_17:01:16_NeutralFitPlot.png → ...T1_2021-09-11_17:01:16_NeutralFitPlot.png
diff --git a/...6_NeutralFitPlot_withNonNeutralColors.png → ...6_NeutralFitPlot_withNonNeutralColors.png b/...6_NeutralFitPlot_withNonNeutralColors.png → ...6_NeutralFitPlot_withNonNeutralColors.png
diff --git a/...21-09-11_17:01:16_NonNeutral_Outliers.csv → ...21-09-11_17:01:16_NonNeutral_Outliers.csv b/...21-09-11_17:01:16_NonNeutral_Outliers.csv → ...21-09-11_17:01:16_NonNeutral_Outliers.csv
diff --git a/.../nonProgressorsT2_2021-09-11_17:02:10.txt → .../nonProgressorsT2_2021-09-11_17:02:10.txt b/.../nonProgressorsT2_2021-09-11_17:02:10.txt → .../nonProgressorsT2_2021-09-11_17:02:10.txt
diff --git a/...T2_2021-09-11_17:02:10_FullNonNeutral.csv → ...T2_2021-09-11_17:02:10_FullNonNeutral.csv b/...T2_2021-09-11_17:02:10_FullNonNeutral.csv → ...T2_2021-09-11_17:02:10_FullNonNeutral.csv
diff --git a/...T2_2021-09-11_17:02:10_NeutralFitPlot.png → ...T2_2021-09-11_17:02:10_NeutralFitPlot.png b/...T2_2021-09-11_17:02:10_NeutralFitPlot.png → ...T2_2021-09-11_17:02:10_NeutralFitPlot.png
diff --git a/...0_NeutralFitPlot_withNonNeutralColors.png → ...0_NeutralFitPlot_withNonNeutralColors.png b/...0_NeutralFitPlot_withNonNeutralColors.png → ...0_NeutralFitPlot_withNonNeutralColors.png
diff --git a/...21-09-11_17:02:10_NonNeutral_Outliers.csv → ...21-09-11_17:02:10_NonNeutral_Outliers.csv b/...21-09-11_17:02:10_NonNeutral_Outliers.csv → ...21-09-11_17:02:10_NonNeutral_Outliers.csv
diff --git a/...ssors/progressors_2021-09-11_16:51:08.txt → ...ssors/progressors_2021-09-11_16:51:08.txt b/...ssors/progressors_2021-09-11_16:51:08.txt → ...ssors/progressors_2021-09-11_16:51:08.txt
diff --git a/...rs_2021-09-11_16:51:08_FullNonNeutral.csv → ...rs_2021-09-11_16:51:08_FullNonNeutral.csv b/...rs_2021-09-11_16:51:08_FullNonNeutral.csv → ...rs_2021-09-11_16:51:08_FullNonNeutral.csv
diff --git a/...rs_2021-09-11_16:51:08_NeutralFitPlot.png → ...rs_2021-09-11_16:51:08_NeutralFitPlot.png b/...rs_2021-09-11_16:51:08_NeutralFitPlot.png → ...rs_2021-09-11_16:51:08_NeutralFitPlot.png
diff --git a/...8_NeutralFitPlot_withNonNeutralColors.png → ...8_NeutralFitPlot_withNonNeutralColors.png b/...8_NeutralFitPlot_withNonNeutralColors.png → ...8_NeutralFitPlot_withNonNeutralColors.png
diff --git a/...21-09-11_16:51:08_NonNeutral_Outliers.csv → ...21-09-11_16:51:08_NonNeutral_Outliers.csv b/...21-09-11_16:51:08_NonNeutral_Outliers.csv → ...21-09-11_16:51:08_NonNeutral_Outliers.csv
diff --git a/...sT1/progressorsT1_2021-09-11_16:56:12.txt → ...sT1/progressorsT1_2021-09-11_16:56:12.txt b/...sT1/progressorsT1_2021-09-11_16:56:12.txt → ...sT1/progressorsT1_2021-09-11_16:56:12.txt
diff --git a/...T1_2021-09-11_16:56:12_FullNonNeutral.csv → ...T1_2021-09-11_16:56:12_FullNonNeutral.csv b/...T1_2021-09-11_16:56:12_FullNonNeutral.csv → ...T1_2021-09-11_16:56:12_FullNonNeutral.csv
diff --git a/...T1_2021-09-11_16:56:12_NeutralFitPlot.png → ...T1_2021-09-11_16:56:12_NeutralFitPlot.png b/...T1_2021-09-11_16:56:12_NeutralFitPlot.png → ...T1_2021-09-11_16:56:12_NeutralFitPlot.png
diff --git a/...2_NeutralFitPlot_withNonNeutralColors.png → ...2_NeutralFitPlot_withNonNeutralColors.png b/...2_NeutralFitPlot_withNonNeutralColors.png → ...2_NeutralFitPlot_withNonNeutralColors.png
diff --git a/...21-09-11_16:56:12_NonNeutral_Outliers.csv → ...21-09-11_16:56:12_NonNeutral_Outliers.csv b/...21-09-11_16:56:12_NonNeutral_Outliers.csv → ...21-09-11_16:56:12_NonNeutral_Outliers.csv
diff --git a/...sT2/progressorsT2_2021-09-11_16:57:39.txt → ...sT2/progressorsT2_2021-09-11_16:57:39.txt b/...sT2/progressorsT2_2021-09-11_16:57:39.txt → ...sT2/progressorsT2_2021-09-11_16:57:39.txt
diff --git a/...T2_2021-09-11_16:57:39_FullNonNeutral.csv → ...T2_2021-09-11_16:57:39_FullNonNeutral.csv b/...T2_2021-09-11_16:57:39_FullNonNeutral.csv → ...T2_2021-09-11_16:57:39_FullNonNeutral.csv
diff --git a/...T2_2021-09-11_16:57:39_NeutralFitPlot.png → ...T2_2021-09-11_16:57:39_NeutralFitPlot.png b/...T2_2021-09-11_16:57:39_NeutralFitPlot.png → ...T2_2021-09-11_16:57:39_NeutralFitPlot.png
diff --git a/...9_NeutralFitPlot_withNonNeutralColors.png → ...9_NeutralFitPlot_withNonNeutralColors.png b/...9_NeutralFitPlot_withNonNeutralColors.png → ...9_NeutralFitPlot_withNonNeutralColors.png
diff --git a/...21-09-11_16:57:39_NonNeutral_Outliers.csv → ...21-09-11_16:57:39_NonNeutral_Outliers.csv b/...21-09-11_16:57:39_NonNeutral_Outliers.csv → ...21-09-11_16:57:39_NonNeutral_Outliers.csv
diff --git a/neuevo/__init__.py b/neuevo/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.0.0"
diff --git a/neuevo/neufit.py b/neuevo/neufit.py
@@ -0,0 +1,235 @@
+# neufit: Fit a neutral community model to species abundances, e.g. from an OTU table
+#
+# For the theory behind this see Sloan et al, Environ Microbiol 2006 8:732-740.
+# To run on the example simulation data: python neufit.py sim_data.csv
+# To link with the mock taxonomy use the -t sim_taxonomy.csv option
+#
+# Copyright (C) 2018 Michael Sieber (sieber.ecoevo.de)
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Github: https://github.com/misieber/neufit
+
+import os
+import scipy
+import numpy as np
+import pandas as pd
+from datetime import datetime
+from lmfit import Parameters, Model, fit_report 
+from scipy.stats import beta 
+from statsmodels.stats.proportion import proportion_confint 
+from neuevo.neufit_utils import beta_cdf, subsample
+from neuevo.neuplot import neufit_plot
+
+'''
+All the code in the function below was written by the original Neufit authors, not me, and is from the neufit.py file.
+neufit.py: https://github.com/misieber/neufit/blob/master/neufit.py
+
+Here are the changes I made:
+- Turned it into a function instead of using argv as input
+    - There are specific notes on what I changed in the 'Notes: Neufit Modifyed Args Input/Output Details' section.
+- Changed all syntax from Python 2 to Python 3.
+- Changed all print statements to write-to-file statements
+    - Made any necessary edits to allow data to be written to a file instead of printed to the terminal
+'''
+
+
+def neufit(fnData, fnTaxonomy, output_filename,
+           dataset_type, custom_filename,
+           norm_graph, colored_graph, non_neutral,
+           non_save = False, full_non_neutral = False):
+    '''
+    Main entry point of the pipeline: fits the neutral model via main_neufit, then
+    optionally plots the fit and exports the most non-neutral microbes.
+
+    Inputs:    fnData : path of the [name]_data.csv file, passed through to main_neufit
+               fnTaxonomy : path of the [name]_taxonomy.csv file, passed through to main_neufit
+               output_filename : the name which will be at the front of all the files; ex. 'combined'
+               dataset_type : the dataset type we have from here: ('hutchKraken', 'gregTCGA')
+               custom_filename : dependent on the dataset (currently not used inside this function):
+                       - hutchKraken : the name of the biom file ex.'combined_biome'
+
+               norm_graph : True/False : Plots the neutral evolution graph without any coloring
+               colored_graph : True/False : Plots the neutral evolution graph with non-neutral coloring
+               non_neutral : True/False : Exports the most non-neutral microbes (non_neutral_outliers)
+
+               Default inputs to be changed mainly for testing purposes:
+               non_save : True/False, False = default
+                           - * This is intended for testing only! *
+                           - When True the plots are not passed to save_plot and the Neufit
+                             .txt report is deleted again at the end of the run
+                           - The flag is also forwarded to non_neutral_outliers
+               full_non_neutral : True/False : Creates a csv file from the original Neufit program with
+                   information about what is neutral and how far off the curve each point is
+                   - csv will have: otu_id, mean_abundance, occurrence, Kingdom, Phylum, Class, Order,
+                     Family, Genus, Species, predicted_occurrence, lower_conf_int, upper_conf_int
+
+    Returns:   None
+    '''
+
+    # Run Neufit
+    (occurr_freqs, n_reads, n_samples,
+     r_square, beta_fit, file_header) = main_neufit(output_filename, dataset_type,
+                                                    fnData, fnTaxonomy,
+                                                    full_non_neutral)
+
+    # NOTE(review): save_plot, custom_color_plot and non_neutral_outliers are not
+    # defined or imported by this module's import block (only neufit_plot is) --
+    # confirm where they live and import them, otherwise these branches raise NameError.
+
+    # Neutral evolution graph, no color
+    if norm_graph:
+        nc_fn = neufit_plot(occurr_freqs, n_reads, n_samples, r_square, beta_fit, file_header)
+        if not non_save:
+            save_plot(nc_fn)
+
+    # Neutral evolution graph with colors
+    if colored_graph:
+        cc_fn = custom_color_plot(occurr_freqs, n_reads, n_samples, r_square, beta_fit, file_header)
+        if not non_save:
+            save_plot(cc_fn)
+
+    # Non-neutral outline
+    if non_neutral:
+        non_neutral_outliers(file_header, occurr_freqs, dataset_type, non_save)
+
+    # main_neufit always writes the Neufit text report; it is easier to delete it
+    # afterwards than to not create it.
+    if non_save:
+        os.remove(str(file_header) + ".txt")
+
+    # The [name]_data.csv and [name]_taxonomy.csv inputs (fnData / fnTaxonomy) are
+    # intentionally left on disk even when non_save is set: they simply get
+    # overwritten on the next run, so clutter is usually not an issue.
+
+
+def main_neufit(output_filename, dataset_type, _data_filename, _taxonomy_filename,
+                full_non_neutral = False, arg_ignore_level = 0, arg_rarefaction_level = 0,
+                output_root = None):
+    '''Runs the main section of Neufit: loads the abundance table, rarefies it, fits the
+       neutral model and writes the text report (and optionally the non-neutral csv).
+
+       Inputs:  output_filename : the name which will be at the front of all the files; ex. 'combined'
+                dataset_type : the dataset type we have from here: ('hutchKraken', 'gregTCGA');
+                               'TCGA_WGS' names the index 'gOTU' and reads the taxonomy with index_col=1
+                _data_filename = path of []_data.csv file needed for Neufit to run (tab separated)
+                _taxonomy_filename = path of []_taxonomy.csv file needed for Neufit to run, or None to skip the join
+                full_non_neutral = True/False ; also write [file_header]_FullNonNeutral.csv
+                arg_ignore_level = 0 ; default set from original Neufit program
+                arg_rarefaction_level = 0; default set from original Neufit program
+                output_root = base directory for all Neufit outputs. When None, a module-level
+                              `neufit_output_path` is used if one has been set, otherwise the
+                              relative directory 'neufit_output'
+       Outputs (returned as a tuple, in this order):
+                occurr_freqs = pandas df that Neufit created in the original program but didn't specifically output originally
+                    - Headers of csv: otu_id, mean_abundance, occurrence, Kingdom, Phylum, Class, Order, Family, Genus, Species, predicted_occurrence, lower_conf_int, upper_conf_int
+                    - Used downstream to figure out which species is the most non-neutral
+                n_reads = Number of reads per sample after rarefaction (from original program)
+                n_samples = Number of samples (from original program)
+                r_square = R^2 value (from original program)
+                beta_fit = lmfit result of fitting beta_cdf (from original program)
+                file_header = path prefix for all outputs of this run:
+                    [output_root]/[dataset_type]/[output_filename]/[output_filename]_[date]_[time]
+    '''
+
+    # The original code read a global `neufit_output_path` that this module never
+    # defines, so every call raised NameError. Honour an explicitly passed root
+    # first, then a module-level value if a caller has injected one, then a default.
+    if output_root is None:
+        output_root = globals().get('neufit_output_path', 'neufit_output')
+
+    # Take a single timestamp for the whole run. strftime replaces splitting
+    # str(time) on '.', which raises ValueError when the microsecond field is 0.
+    stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
+
+    # Create the directory for all Neufit outputs if it doesn't already exist
+    dir_name = os.path.join(str(output_root), str(dataset_type), str(output_filename))
+    os.makedirs(dir_name, exist_ok=True)
+
+    # file_header holds the path / location for all future Neufit outputs
+    file_header = os.path.join(dir_name, str(output_filename) + '_' + stamp)
+
+    # The context manager guarantees the report is closed even if reading or fitting fails
+    with open(file_header + ".txt", 'w') as report:
+
+        # Print statements with important info
+        print("Running dataset: " + str(dataset_type) + "Category:" + str(output_filename) + '\n')
+
+        # Write dataset info + the number of reads per sample to the report
+        report.write('Corresponding csv file: ' + str(_data_filename) + '\n')
+        abundances = pd.read_table(_data_filename, header=0, index_col=0, sep='\t').astype(int)
+        abundances = abundances[abundances.sum(1) > arg_ignore_level]
+        report.write('Dataset contains ' + str(abundances.shape[1]) + ' samples (sample_id, reads): \n')
+
+        # Series.items() replaces DataFrame.iteritems(), which pandas 2.0 removed
+        sample_depths = abundances.sum(0)
+        for sample_id, depth in sample_depths.items():
+            report.write(str(sample_id) + '\t' + str(depth) + '\n')
+        report.write('\n')
+
+        # Determine uniform read depth. The trailing space restores the separator that
+        # the Python 2 `print ...,` statements used to emit before the '(' below.
+        if arg_rarefaction_level == 0 or arg_rarefaction_level > max(sample_depths):
+            arg_rarefaction_level = min(sample_depths)
+            report.write('rarefying to highest possible uniform read depth ')
+        else:
+            report.write('rarefying to custom rarefaction level ')
+        report.write('(' + str(arg_rarefaction_level) + ' reads per sample) \n')
+
+        # Subsample the abundance table, unless all samples already have the required uniform read depth
+        if (sample_depths != arg_rarefaction_level).any():
+            abundances = subsample(abundances, arg_rarefaction_level)
+            abundances = abundances[abundances.sum(1) > 0]
+
+        # Dataset shape
+        n_otus, n_samples = abundances.shape
+        n_reads = arg_rarefaction_level
+
+        report.write('fitting neutral expectation to dataset with ' + str(n_samples) + ' samples and ' + str(n_otus) + ' otus \n \n')
+
+        # Calculate mean relative abundances and occurrence frequencies
+        mean_relative_abundance = (1.0*abundances.sum(1))/n_reads/n_samples
+        occurrence_frequency = (1.0*np.count_nonzero(abundances, axis=1))/n_samples
+
+        occurr_freqs = pd.DataFrame(mean_relative_abundance, columns=['mean_abundance'])
+        # The index name becomes the header of the first csv column
+        occurr_freqs.index.name = 'gOTU' if dataset_type == 'TCGA_WGS' else 'otu_id'
+        occurr_freqs['occurrence'] = occurrence_frequency
+        occurr_freqs = occurr_freqs.sort_values(by=['mean_abundance'])
+
+        # Join with taxonomic information (optional)
+        if _taxonomy_filename is not None:
+            # TCGA_WGS taxonomy files are indexed by their second column
+            taxonomy_index_col = 1 if dataset_type == 'TCGA_WGS' else 0
+            taxonomy = pd.read_table(_taxonomy_filename, header=0, index_col=taxonomy_index_col, sep='\t')
+            occurr_freqs = occurr_freqs.join(taxonomy)
+
+        # Fit the neutral model: N is fixed to the read depth, only m is varied
+        params = Parameters()
+        params.add('N', value=n_reads, vary=False)
+        params.add('m', value=0.5, min=0.0, max=1.0)
+        beta_model = Model(beta_cdf)
+        beta_fit = beta_model.fit(occurr_freqs['occurrence'], params, p=occurr_freqs['mean_abundance'])
+
+        # Report fit statistics (R^2 = 1 - residual sum of squares / total sum of squares)
+        observed = occurr_freqs['occurrence']
+        ss_residual = np.sum(np.square(observed - beta_fit.best_fit))
+        ss_total = np.sum(np.square(observed - np.mean(observed)))
+        r_square = 1.0 - ss_residual/ss_total
+
+        fit_summary = fit_report(beta_fit)
+        r_square_line = '\n R^2 = ' + '{:1.2f}'.format(r_square)
+        report.write(fit_summary)
+        report.write(r_square_line)
+        print(fit_summary)
+        print(r_square_line)
+        print('=========================================================')
+
+        # Adding the neutral prediction to results
+        occurr_freqs['predicted_occurrence'] = beta_fit.best_fit
+        occurr_freqs['lower_conf_int'], occurr_freqs['upper_conf_int'] = proportion_confint(occurr_freqs['predicted_occurrence']*n_samples, n_samples, alpha=0.05, method='wilson')
+
+        # Create original non neutral output file from Neufit
+        # (non-neutral otus are simply those lying outside the confidence intervals)
+        if full_non_neutral:
+            above = occurr_freqs[occurr_freqs['occurrence'] > occurr_freqs['upper_conf_int']]
+            below = occurr_freqs[occurr_freqs['occurrence'] < occurr_freqs['lower_conf_int']]
+            pd.concat((above, below)).to_csv(file_header + '_FullNonNeutral.csv')
+
+    return (occurr_freqs, n_reads, n_samples, r_square, beta_fit, file_header)
diff --git a/neuevo/neufit_utils.py b/neuevo/neufit_utils.py
@@ -0,0 +1,59 @@
+# neufit: Fit a neutral community model to species abundances, e.g. from an OTU table
+#
+# For the theory behind this see Sloan et al, Environ Microbiol 2006 8:732-740.
+# To run on the example simulation data: python neufit.py sim_data.csv
+# To link with the mock taxonomy use the -t sim_taxonomy.csv option
+#
+# Copyright (C) 2018 Michael Sieber (sieber.ecoevo.de)
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Github: https://github.com/misieber/neufit
+
+import os
+import numpy as np
+from scipy.stats import beta 
+from datetime import datetime
+
+def beta_cdf(p, N, m):
+    '''Expected long term distribution under the neutral model.
+
+    Evaluates a truncated cumulative beta-distribution with shape parameters
+    N*m*p and N*m*(1-p): the mass between the detection limit 1/N and 1.
+    '''
+    shape_a = N*m*p
+    shape_b = N*m*(1.0-p)
+    mass_up_to_one = beta.cdf(1.0, shape_a, shape_b)
+    mass_up_to_limit = beta.cdf(1.0/N, shape_a, shape_b)
+    return mass_up_to_one - mass_up_to_limit
+
+def subsample(counts, depth):
+    '''Subsamples counts to uniform depth, dropping all samples without enough depth.
+
+    counts : DataFrame of integer read counts, one column per sample
+    depth  : number of reads to draw (without replacement) from every sample
+
+    Returns a new DataFrame; the DataFrame passed in is left unmodified. (The
+    original version overwrote the caller's columns in place, but only up to the
+    first dropped sample, after which it silently switched to a copy.)
+    '''
+    rarefied = counts.copy()
+    too_shallow = []
+    for sample in counts:
+        column = counts[sample]
+        if column.sum() >= depth:
+            # Expand the counts into one entry per read (labelled by row position),
+            # draw `depth` reads without replacement, then count them back up per row.
+            flattened = np.repeat(np.arange(column.size), column)
+            drawn = np.random.choice(flattened, depth, replace=False)
+            rarefied[sample] = np.bincount(drawn, minlength=column.size)
+        else:
+            # str(sample): column labels are not guaranteed to be strings
+            print('dropping sample ' + str(sample) + ' with ' + str(column.sum()) + ' reads < ' + str(depth))
+            too_shallow.append(sample)
+    return rarefied.drop(columns=too_shallow)
+
+def non_negative_int(arg):
+    '''Argparser type: non-negative int.
+
+    Converts arg with int() and returns the result.
+    Raises argparse.ArgumentTypeError when the value is negative; a value that
+    int() cannot parse raises whatever int() raises (ValueError / TypeError).
+    '''
+    # Imported here because this module's import block does not bring in argparse;
+    # the original raise hit NameError instead of ArgumentTypeError.
+    from argparse import ArgumentTypeError
+
+    nnint = int(arg)
+    if nnint < 0:
+        # str(arg) keeps the message working when arg is not already a string
+        raise ArgumentTypeError(str(arg) + ' < 0, must be non-negative')
+    return nnint