def generate_pr_a(intercepts, covariates=None, coefs=None):
    """Calculate Pr(A=1 | X) for every observation under a logistic model.

    The model is logit(Pr(A=1 | X)) = alpha_0 + coefs[0]*X + ..., where alpha_0 is the candidate
    balancing intercept supplied by the root-finding procedure.

    Parameters
    ----------
    intercepts : sequence of float
        Candidate intercept(s); only ``intercepts[0]`` is read.
    covariates : array-like, optional
        Design matrix whose first column is paired with the intercept. Defaults to the module-level
        ``W`` (built in this script from a constant column ``C`` and ``X``).
    coefs : sequence of float, optional
        Coefficients for the remaining columns of the design matrix. Defaults to the module-level
        ``alpha_coefs``.

    Returns
    -------
    numpy.ndarray
        1D array with one probability per row of the design matrix.
    """
    # Local import: the file only imports scipy.optimize at the top
    from scipy.special import expit

    design = W if covariates is None else np.asarray(covariates)
    slopes = alpha_coefs if coefs is None else coefs

    # Intercept first, then the user-specified coefficients (same ordering as the design matrix)
    alpha = np.asarray([intercepts[0]] + list(slopes))

    logit_pr_a = np.dot(design, alpha)              # log-odds of A
    # expit is the inverse logit; unlike 1 / (1 + exp(-x)) it does not overflow for very negative x
    return expit(logit_pr_a)
def generate_pr_m(intercepts, covariates=None, coefs=None):
    """Calculate Pr(M=m | A, X) for m = 0, 1, 2 under a multinomial logit model.

    M=0 is the reference category, so its linear predictor is fixed at zero. The other two categories
    each get one balancing intercept plus the user-specified coefficients.

    Parameters
    ----------
    intercepts : sequence of float
        ``intercepts[0]`` is the intercept for M=1 vs. M=0, ``intercepts[1]`` for M=2 vs. M=0.
    covariates : array-like, optional
        Design matrix whose first column is paired with the intercepts. Defaults to the module-level
        ``V`` (built in this script from the columns ``C``, ``A``, ``X``).
    coefs : sequence of two sequences of float, optional
        Non-intercept coefficients for M=1 and M=2, in that order. Defaults to the module-level
        ``beta_coefs``.

    Returns
    -------
    numpy.ndarray
        2D array of shape (3, n); row m holds Pr(M=m) for each of the n observations.

    Warns
    -----
    UserWarning
        If the three probabilities of any observation do not sum to 1 (after rounding to 7 decimals).
    """
    design = V if covariates is None else np.asarray(covariates)
    slopes = beta_coefs if coefs is None else coefs

    beta_10 = np.asarray([intercepts[0]] + list(slopes[0]))   # intercept + coefs for M=1 vs. M=0
    beta_20 = np.asarray([intercepts[1]] + list(slopes[1]))   # intercept + coefs for M=2 vs. M=0

    # Linear predictors are computed once each (the dot products dominate the cost for large n)
    log_odds = np.array([np.zeros(design.shape[0]),           # reference category M=0
                         np.dot(design, beta_10),             # M=1
                         np.dot(design, beta_20)])            # M=2

    # Softmax with the per-observation maximum subtracted: mathematically identical to
    # exp(lp) / (1 + exp(lp_1) + exp(lp_2)), but exp() can no longer overflow into inf / inf = nan
    log_odds -= log_odds.max(axis=0)
    weights = np.exp(log_odds)
    prob_m = weights / weights.sum(axis=0)

    # Extra step to check if probability sums to 1 for each individual
    if not np.all(np.sum(prob_m, axis=0).round(7) == 1.):     # (rounding to avoid approximation errors)
        warnings.warn("Some Pr didn't sum to 1... :(",
                      UserWarning)

    return prob_m
def random_multinomial(a, p):
    """Draw one category per observation from column-wise multinomial probabilities.

    Uses inverse-CDF sampling: for each column of ``p`` a single uniform value is drawn and the
    category is the number of cumulative probabilities lying strictly below that value.

    Parameters
    ----------
    a : sequence
        Category labels; ``a[i]`` is returned when row ``i`` of ``p`` is selected.
    p : array-like
        2D array of shape (n_categories, n_obs). Each column holds the category probabilities for
        one observation.

    Returns
    -------
    numpy.ndarray
        1D array of length ``p.shape[1]`` with one label per observation.

    Notes
    -----
    Random values come from the global ``np.random`` state (one ``np.random.rand`` call), so results
    are reproducible under ``np.random.seed``.
    """
    labels = np.asarray(a)
    p = np.asarray(p)

    s = p.cumsum(axis=0)                 # cumulative probabilities down the categories
    r = np.random.rand(p.shape[1])       # one uniform draw per observation
    k = (s < r).sum(axis=0)              # index of the first category whose cumulative prob >= draw

    # Floating-point rounding can leave the last cumulative total marginally below 1, in which case a
    # draw above it would index one past the final category. Any such draw belongs to the last category.
    k = np.minimum(k, p.shape[0] - 1)
    return labels[k]
def objective_function_y(intercepts):
    """Objective function to use with a root-finding algorithm to solve for the intercept that provides the desired
    marginal mean of Y

    Parameters
    ----------
    intercepts : sequence of float
        Candidate intercept(s), passed through unchanged to ``generate_y``.

    Returns
    -------
    float
        Mean of the values returned by ``generate_y`` minus ``desired_margin_y``. The root-finder searches
        for the intercept at which this difference is zero.
    """
    val_y = generate_y(intercepts=intercepts)       # Calculate values of Y for given intercept
    marginal_mu_y = np.mean(val_y)                  # Calculate the marginal mean of Y
    return marginal_mu_y - desired_margin_y         # Difference between current and desired marginal mean
+ax.arrow(0.58, 0.72, 0.10, -0.15, head_width=0.01, color='k') +ax.arrow(0.58, 0.27, 0.10, 0.15, head_width=0.01, color='k') +ax.fill_between([0.55, 0.83], [0.55, 0.55], [0.45, 0.45], color='orange', alpha=0.2) +ax.text(0.57, 0.48, "Discriminator", size=16) +ax.arrow(0.85, 0.5, 0.05, 0, head_width=0.01, color='k') +ax.text(0.93, 0.485, "T/F", size=14) + +ax.set_ylim([-0., 1.]) +ax.set_xlim([-0., 1.]) +plt.axis('off') +plt.tight_layout() +plt.savefig("images/gan_flow.png", dpi=600, format='png') +plt.close() + +# IMAGE: flow diagram for RNN text generation +fig, ax = plt.subplots() + +# Step 1: PubMed Query +ax.text(0.02, 0.95, "1: Query PubMed") +rectangle = Rectangle((0., 0.62), 0.95, 0.38, alpha=0.2, color='aqua') +ax.add_patch(rectangle) +ax.text(0.11, 0.86, "1a: Conduct search & extract PubMed IDs") +rectangle = Rectangle((0.1, 0.92), 0.8, -0.08, alpha=0.2, color='blue') +ax.add_patch(rectangle) +ax.text(0.11, 0.76, "1b: Select random sample") +rectangle = Rectangle((0.1, 0.82), 0.8, -0.08, alpha=0.2, color='blue') +ax.add_patch(rectangle) +ax.text(0.11, 0.66, "1c: Pull meta-data from PubMed") +rectangle = Rectangle((0.1, 0.72), 0.8, -0.08, alpha=0.2, color='blue') +ax.add_patch(rectangle) + +# Step 2: text processing +ax.text(0.02, 0.55, "2: Text processing") +rectangle = Rectangle((0., 0.32), 0.95, 0.28, alpha=0.2, color='orange') +ax.add_patch(rectangle) +ax.text(0.11, 0.47, "2a: Extract abstracts") +rectangle = Rectangle((0.1, 0.53), 0.8, -0.08, alpha=0.2, color='red') +ax.add_patch(rectangle) +ax.text(0.11, 0.37, "2b: Format text to training data") +rectangle = Rectangle((0.1, 0.43), 0.8, -0.08, alpha=0.2, color='red') +ax.add_patch(rectangle) + +# Step 3: train network +ax.text(0.02, 0.24, "3: RNN") +rectangle = Rectangle((0., -0.4), 0.95, 0.7, alpha=0.2, color='green', zorder=1) +ax.text(0.18, -0.1, "Input: \nsent") +ax.text(0.775, -0.1, "Output: \nsentence") + +ax.add_patch(rectangle) +ax.text(0.275, 0.14, "sent") +ax.text(0.292, -0.33, "e") 
+ax.arrow(0.3, 0.12, 0, -0.05, head_width=0.01, color='k') +ax.arrow(0.3, -0.21, 0, -0.05, head_width=0.01, color='k') +ax.scatter([0.30, 0.30, 0.30, 0.30], [-0.15, -0.10, -0.05, 0.], + marker='o', c='white', edgecolors='k', zorder=2) +rectangle = Rectangle((0.29, -0.2), 0.02, 0.25, alpha=1, facecolor='none', edgecolor='k') +ax.add_patch(rectangle) +ax.arrow(0.315, -0.28, 0.09, 0.4, head_width=0.01, color='k') + +ax.text(0.415, 0.14, "ente") +ax.text(0.431, -0.33, "n") +ax.arrow(0.44, 0.12, 0, -0.05, head_width=0.01, color='k') +ax.arrow(0.44, -0.21, 0, -0.05, head_width=0.01, color='k') +ax.scatter([0.44, 0.44, 0.44, 0.44], [-0.15, -0.10, -0.05, 0.], + marker='o', c='white', edgecolors='k', zorder=2) +rectangle = Rectangle((0.43, -0.2), 0.02, 0.25, alpha=1, facecolor='none', edgecolor='k') +ax.add_patch(rectangle) +ax.arrow(0.45, -0.28, 0.09, 0.4, head_width=0.01, color='k') + +ax.text(0.55, 0.14, "nten") +ax.text(0.563, -0.33, "c") +ax.arrow(0.57, 0.12, 0, -0.05, head_width=0.01, color='k') +ax.arrow(0.57, -0.21, 0, -0.05, head_width=0.01, color='k') +ax.scatter([0.57, 0.57, 0.57, 0.57], [-0.15, -0.10, -0.05, 0.], + marker='o', c='white', edgecolors='k', zorder=2) +rectangle = Rectangle((0.5595, -0.2), 0.02, 0.25, alpha=1, facecolor='none', edgecolor='k') +ax.add_patch(rectangle) +ax.arrow(0.58, -0.28, 0.09, 0.4, head_width=0.01, color='k') + +ax.text(0.675, 0.14, "tenc") +ax.text(0.687, -0.33, "e") +ax.arrow(0.695, 0.12, 0, -0.05, head_width=0.01, color='k') +ax.arrow(0.695, -0.21, 0, -0.05, head_width=0.01, color='k') +ax.scatter([0.695, 0.695, 0.695, 0.695], [-0.15, -0.10, -0.05, 0.], + marker='o', c='white', edgecolors='k', zorder=2) +rectangle = Rectangle((0.6855, -0.2), 0.02, 0.25, alpha=1, facecolor='none', edgecolor='k') +ax.add_patch(rectangle) + +ax.set_ylim([-0.6, 1.1]) +ax.set_xlim([-0., 0.96]) +plt.axis('off') +plt.tight_layout() +plt.savefig("images/rnn_flow.png", dpi=600, format='png') +plt.close() diff --git a/ISCB43/generate_slides.py 
b/ISCB43/generate_slides.py new file mode 100644 index 0000000..f03c373 --- /dev/null +++ b/ISCB43/generate_slides.py @@ -0,0 +1,487 @@ +import numpy as np +import pandas as pd +from pylatex import (Document, Section, Subsection, Package, Command, + Figure, NoEscape, LineBreak, Itemize, Enumerate) + +tex_file_name = "Zivich_Python_ISCB43" + + +class Frame: + def __init__(self, title): + doc.append(NoEscape(r'\begin{frame}{'+str(title)+'}')) + + def append(self, text): + doc.append(text) + + def end(self): + doc.append(NoEscape(r'\end{frame}')) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.end() + + +def center(document, text): + document.append(NoEscape('{')) + document.append(Command('centering')) + document.append(text) + document.append(NoEscape('}')) + + +##################### +# TeX setup +doc = Document(tex_file_name, documentclass="beamer") + +doc.preamble.append(Command('usetheme', 'Copenhagen')) +doc.preamble.append(Command('usecolortheme', 'whale')) + +doc.packages.append(Package('amsmath')) +doc.packages.append(Package('xcolor')) +doc.packages.append(Package('graphicx')) +doc.packages.append(Package('fontawesome5')) +doc.packages.append(Package('pythonhighlight')) + +doc.preamble.append(Command('usefonttheme', 'serif', 'onlymath')) + +doc.preamble.append(Command('setbeamercovered', 'transparent')) +doc.preamble.append(Command('setbeamertemplate', 'navigation symbols', + extra_arguments=" ")) +doc.preamble.append(Command('setbeamertemplate', 'page number in head/foot', + extra_arguments=NoEscape(r'\insertframenumber'))) +doc.preamble.append(Command('setbeamertemplate', 'headline', + extra_arguments=" ")) + +doc.preamble.append(Command('title', NoEscape(r'\huge Why I Use Python \\' + r'\large (and Why You Should Too)'), + 'Why I Use Python')) +doc.preamble.append(Command('author', NoEscape(r'Paul Zivich \\~\\ Institute of Global Health and Infectious Diseases\\' + r'Causal Inference Research Laboratory \\' 
+ r'University of North Carolina at Chapel Hill'), + 'Paul Zivich')) +doc.preamble.append(Command('date', NoEscape(r'August 25, 2022'))) + +################################ +# Title + +doc.append(NoEscape(r'\maketitle')) + +################################ +# Acknowledgements + +with Frame(title="Acknowledgements") as f: + f.append(NoEscape(r"Supported by NIH T32-AI007001." + r"\footnote[frame]{Footnotes are reserved asides for possible later discussion or questions}" + r"\\~\\~\\")) + f.append(NoEscape(r"Python: \faPython \\~\\~\\")) + center(document=f, + text=NoEscape(r"\faEnvelope \quad pzivich@unc.edu \qquad " + r"\faTwitter \quad @PausalZ \qquad" + r"\faGithub \quad pzivich\\")) + f.append(NoEscape(r"~\\~\\ Slides and code at https://github.com/pzivich/Presentations")) + +################################ +# Outline + +with Frame(title="Outline") as f: + f.append(NoEscape(r"My background \\~\\")) + f.append(NoEscape(r"Value add of \faPython\footnote[frame]{I am going to pick on R, please save angry emails till " + r"after the presentation} \\~\\")) + f.append(NoEscape(r"Illustrative applications \\~\\")) + f.append(NoEscape(r"Installation and Conclusions")) + +################################ +# My background + +with Frame(title="About me") as f: + f.append("An epidemiologist working in methods and infectious diseases") + f.append(NoEscape(r"\\~\\")) + f.append(NoEscape(r"Using \faPython\; since 2016")) + with doc.create(Itemize()) as itemize: + itemize.add_item("Largely self-taught") + f.append(NoEscape(r"~\\")) + f.append(NoEscape("Active contributor")) + with doc.create(Itemize()) as itemize: + itemize.add_item(NoEscape(r"\texttt{zEpid}, " + r"\texttt{delicatessen}\footnote[frame]{Zivich PN, et al. (2022) Delicatessen: " + r"M-Estimation in Python. \textit{arXiv:2203.11300}}, " + r"\texttt{MossSpider}\footnote[frame]{Zivich PN, et al. (2022) Targeted maximum " + r"likelihood estimation of causal effects with interference: A simulation study. 
" + r"\textit{Statistics in Medicine}}")) + itemize.add_item(NoEscape(r"\texttt{lifelines}")) + f.append(NoEscape(r"~\\")) + f.append(NoEscape(r"\faPython\; is my primary software")) + with doc.create(Itemize()) as itemize: + itemize.add_item("Also use R, SAS") + +################################ +# Claim + +with Frame(title="A Software Philosophy") as f: + f.append("To be a good epidemiologist / biostatistician / data scientist / someone who works with " + "data, familiarity with multiple software languages is important") + with doc.create(Itemize()) as itemize: + itemize.add_item("No software is complete for all tasks") + itemize.add_item(NoEscape(r"\textit{Lingua franca} of fields will change")) + itemize.add_item("Harder to be replaced") + f.append(NoEscape(r"~\\ Why \faPython\; should be added to your repertoire")) + +################################ +# Python background + +with Frame(title=NoEscape(r"What is \faPython ?")) as f: + f.append("High-level programming language") + with doc.create(Itemize()) as itemize: + itemize.add_item(NoEscape(r"Interpreted")) + itemize.add_item(NoEscape(r"Object-oriented")) + itemize.add_item(NoEscape(r"Free, open-source")) + itemize.add_item(NoEscape(r"Supported for all major platforms")) + itemize.add_item(NoEscape(r"Scales to available hardware")) + f.append(NoEscape(r"~\\Some advantages from my perspective")) + with doc.create(Itemize()) as itemize: + itemize.add_item(NoEscape(r"Language features")) + itemize.add_item(NoEscape(r"Cross-software interactions")) + itemize.add_item(NoEscape(r"Popularity")) + +################################ +# Advantages of Specifics + +with Frame(title=NoEscape(r"Advantage: language-specific features")) as f: + f.append(NoEscape(r"Class objects \\~\\")) + f.append(NoEscape(r"Namespaces and modules \\~\\")) + f.append(NoEscape(r"Readability \\~\\")) + f.append(NoEscape(r"Accuracy")) + +with Frame(title=r"Class objects") as f: + f.append("Object that hold") + with doc.create(Itemize()) as 
itemize: + itemize.add_item("Functions, other objects") + itemize.add_item("Each function can have unique parameters and docs") + itemize.add_item("Store hidden parameters for testing") + f.append(NoEscape(r"~\\")) + f.append(NoEscape(r"\inputpython{generate_slides.py}{9}{15}")) + +with Frame(title=NoEscape(r"Class objects")) as f: + f.append(NoEscape(r"\includegraphics[width=0.9\linewidth]{images/r_tmle.PNG}")) + +with Frame(title=r"Class objects") as f: + f.append(NoEscape(r"\inputpython{stat_examples.py}{6}{13}")) + +with Frame(title=r"Namespace of modules") as f: + f.append(NoEscape(r"\begin{center}" + r"\includegraphics[width=0.8\linewidth]{images/R-namespace.png}" + r"\end{center}")) + f.append(NoEscape(r"~\\")) + f.append(NoEscape(r"R's namespace conflicts in my work")) + with doc.create(Itemize()) as itemize: + itemize.add_item(NoEscape(r"Network analysis in R: \texttt{sna}, \texttt{igraph}")) + itemize.add_item(NoEscape(r"Non-overlapping functionalities")) + itemize.add_item(NoEscape(r"Overlapping functionalities have conflicts")) + +with Frame(title=r"Namespace of modules") as f: + f.append(NoEscape(r"Not a problem in \faPython \\~\\")) + f.append(NoEscape(r"\inputpython{numpy_example.py}{1}{5}")) + +with Frame(title=NoEscape(r"Readability")) as f: + f.append(NoEscape(r"\includegraphics[width=0.9\linewidth]{images/R_bad-loop.PNG}")) + +with Frame(title=r"Readability") as f: + f.append(NoEscape(r"\inputpython{loop_example.py}{1}{14}")) + +with Frame(title=NoEscape(r"Accuracy")) as f: + f.append(NoEscape(r"\includegraphics[width=1.0\linewidth]{images/r_floating_point.png}")) + +with Frame(title=NoEscape(r"Accuracy" + r"\footnote[frame]{Julia also presents this correctly}")) as f: + f.append(NoEscape(r"\includegraphics[width=1.0\linewidth]{images/python_floating_point.png}")) + +################################ +# Ability to Interact + +with Frame(title=r"Advantage: cross-software interactions") as f: + f.append("Python is a good glue language") + with 
doc.create(Itemize()) as itemize: + itemize.add_item(NoEscape(r"Easily interacts with other software")) + with doc.create(Itemize()) as itemize_inner: + itemize_inner.add_item(NoEscape(r"\texttt{C}, \texttt{C++}")) + itemize.add_item(NoEscape(r"Interact with other software:")) + with doc.create(Itemize()) as itemize_inner: + itemize_inner.add_item(NoEscape(r"\texttt{R}: \texttt{RPy2}")) + itemize_inner.add_item(NoEscape(r"\texttt{Stan}: \texttt{PyStan}")) + itemize_inner.add_item(NoEscape(r"\texttt{Julia}: \texttt{PyJulia}")) + itemize_inner.add_item(NoEscape(r"\texttt{SAS}: \texttt{SASPy}")) + + +with Frame(title="Advantage: cross-software interactions") as f: + f.append(NoEscape(r"All slides made with \faPython\; and \LaTeX")) + with doc.create(Itemize()) as itemize: + itemize.add_item(NoEscape(r"Using \texttt{pylatex}")) + f.append(NoEscape(r"\inputpython{generate_slides.py}{33}{44}")) + +################################ +# Popularity across areas + +with Frame(title=NoEscape(r"Advantage: popularity")) as f: + f.append(NoEscape(r"\includegraphics[width=1.0\linewidth]{images/python_trends.png}")) + + +with Frame(title=NoEscape(r"Advantage: popularity")) as f: + f.append("Combination of:") + with doc.create(Itemize()) as itemize: + itemize.add_item("Programmers") + itemize.add_item("Scientists") + itemize.add_item("Statisticians") + f.append(NoEscape(r"~\\")) + f.append("Wide support for use-cases") + +with Frame(title=NoEscape(r"Example: Black Holes")) as f: + f.append(NoEscape(r"\begin{center}" + r"\includegraphics[width=0.55\linewidth]{images/sagittarius-a.jpg}" + r"\end{center}")) + f.append(NoEscape(r"Computations and image processing done using \faPython" + r"\footnote[frame]{Akiyama K, et al. (2022) First Sagittarius A* Event Horizon Telescope " + r"Results. I. The Shadow of the Supermassive Black Hole in the Center of the Milky Way. 
\textit{" + r"The Astrophysical Journal Letters} 930.2 (2022): L12}")) + with doc.create(Itemize()) as itemize: + itemize.add_item("The telescope array generates >350 terabytes of data daily") + +# with Frame(title=NoEscape(r"Example: Image Generation")) as f: +# f.append("DALLE-2: image generation from text descriptions") +# f.append(NoEscape(r"\begin{center}" +# r"\includegraphics[width=0.8\linewidth]{images/dalle2.png}" +# r"\end{center}")) + +# with Frame(title=NoEscape(r"Example: Image Generation" +# r"\footnote[frame]{https://github.com/huggingface/diffusers/releases/tag/v0.2.3}")) as f: +# f.append(NoEscape(r"\begin{center}" +# r"\includegraphics[width=0.75\linewidth]{images/python_dalle2.PNG}" +# r"\end{center}")) + +################################ +# Examples + +with Frame(title="Illustrative Applications") as f: + f.append("Examples") + with doc.create(Itemize()) as itemize: + itemize.add_item(NoEscape(r"Basic statistical applications")) + itemize.add_item(NoEscape(r"Plasmode data simulation with GANs")) + itemize.add_item(NoEscape(r"Scientific abstract text generator")) + +# Basics +with Frame(title="Basics: Regression") as f: + f.append(NoEscape(r"\inputpython{stat_examples.py}{15}{22}")) + +with Frame(title="Basics: Inverse Probability Weighting") as f: + f.append(NoEscape(r"\inputpython{stat_examples.py}{24}{36}")) + f.append(NoEscape(r"\footnote[frame]{Can also be done using \texttt{zEpid}}")) + +with Frame(title="Basics: Survival Analysis") as f: + f.append(NoEscape(r"\inputpython{stat_examples.py}{38}{46}")) + +# GAN for simulations +with Frame(title="Plasmode Simulations with GAN") as f: + f.append("Generative adversarial neural network (GAN) to generate data") + f.append(NoEscape(r"\footnote[frame]{Athey S et al. (2021). Using Wasserstein generative adversarial networks for " + r"the design of Monte Carlo simulations. 
\textit{Journal of Econometrics}}")) + f.append(NoEscape(r"~\\~\\")) + f.append("Generate new data from existing data") + with doc.create(Itemize()) as itemize: + itemize.add_item(NoEscape(r"Avoid arbitrary data generating decisions")) + itemize.add_item(NoEscape(r"Reflect performance in your particular application")) + itemize.add_item(NoEscape(r"Share data without re-identification")) + f.append(NoEscape(r"~\\")) + f.append("Less than 150 lines") + with doc.create(Itemize()) as itemize: + itemize.add_item(NoEscape(r"Compatible with arbitrary input data")) + +with Frame(title=NoEscape(r"Plasmode Simulations with GAN")) as f: + f.append(NoEscape(r"\begin{center}" + r"\includegraphics[width=0.8\linewidth]{images/gan_dgm.png}" + r"\end{center}")) + +with Frame(title=NoEscape(r"Plasmode Simulations with GAN")) as f: + f.append(NoEscape(r"\begin{center}" + r"\includegraphics[width=0.8\linewidth]{images/gan_flow.png}" + r"\end{center}")) + +with Frame(title=NoEscape(r"Plasmode Simulations with GAN")) as f: + f.append(NoEscape(r"\begin{center}" + r"\includegraphics[width=0.8\linewidth]{images/gan_generated_i1.png}" + r"\end{center}")) + +with Frame(title=NoEscape(r"Plasmode Simulations with GAN")) as f: + f.append(NoEscape(r"\begin{center}" + r"\includegraphics[width=0.8\linewidth]{images/gan_generated_i100.png}" + r"\end{center}")) + +with Frame(title=NoEscape(r"Plasmode Simulations with GAN")) as f: + f.append(NoEscape(r"\begin{center}" + r"\includegraphics[width=0.8\linewidth]{images/gan_generated_i500.png}" + r"\end{center}")) + +with Frame(title=NoEscape(r"Plasmode Simulations with GAN")) as f: + f.append(NoEscape(r"\begin{center}" + r"\includegraphics[width=0.8\linewidth]{images/gan_generated_i2000.png}" + r"\end{center}")) + +with Frame(title=NoEscape(r"Plasmode Simulations with GAN")) as f: + f.append(NoEscape(r"\begin{center}" + r"\includegraphics[width=0.8\linewidth]{images/gan_generated_i10000.png}" + r"\end{center}")) + +# RNN for text generation +with 
Frame(title="Text Generation with RNN") as f: + f.append("Recurrent neural network (RNN) to generate abstracts") + f.append(NoEscape(r"\footnote[frame]{Code available at https://github.com/pzivich/RNN-Abstract-Generator}")) + f.append(NoEscape(r"\textsuperscript{,}")) + f.append(NoEscape(r"\footnote[frame]{Sutskever I, Martens J, \& Hinton GE (2011). Generating text with recurrent " + r"neural networks. In \textit{ICML}}")) + f.append(NoEscape(r"~\\~\\")) + f.append("Generate abstracts focusing on causal inference") + with doc.create(Itemize()) as itemize: + itemize.add_item(NoEscape(r"Whole process in \faPython")) + itemize.add_item(NoEscape(r"Train using published abstracts")) + with doc.create(Itemize()) as itemize_inner: + itemize_inner.add_item(NoEscape(r"Query PubMed")) + f.append(NoEscape(r"~\\")) + f.append("Less than 300 lines") + with doc.create(Itemize()) as itemize: + itemize.add_item(NoEscape(r"Written generally, so could be re-trained for other topics")) + +with Frame(title=r"RNN overview") as f: + f.append(NoEscape(r"\begin{center}" + r"\includegraphics[width=0.95\linewidth]{images/rnn_flow.png}" + r"\end{center}")) + +with Frame(title=r"") as f: + f.append(NoEscape(r"\texttt{Input:} Randomized control trials have been criticized as \\~\\")) + f.append(NoEscape(r"\textcolor{white}{\texttt{Output:} Randomized control trials have been criticized as a " + r"mediation " + r"analysis of social schools and other time series (asthma multiple compositions, " + r"and confounding. 
maps that each of the hazard ratio of their assumptions that may examine " + r"the effects of causal inference network (rct) and the causal relationship between disease " + r"construction and intervention (i.e., all motivating population status are presented to assess " + r"robustness framework for interested in the method to infer the substance use and investigating " + r"the associations between structured patients}")) + +with Frame(title=r"") as f: + f.append(NoEscape(r"\texttt{Input:} Randomized control trials have been criticized as \\~\\")) + f.append(NoEscape(r"\texttt{Output:} Randomized control trials have been criticized as a mediation " + r"analysis of social schools and other time series (asthma multiple compositions, " + r"and confounding. maps that each of the hazard ratio of their assumptions that may examine " + r"the effects of causal inference network (rct) and the causal relationship between disease " + r"construction and intervention (i.e., all motivating population status are presented to assess " + r"robustness framework for interested in the method to infer the substance use and investigating " + r"the associations between structured patients")) + +with Frame(title=r"") as f: + f.append(NoEscape(r"\texttt{Input:} Inverse probability of treatment weights were \\~\\")) + f.append(NoEscape(r"\textcolor{white}{\texttt{Output:} Inverse probability of treatment weights were associated " + r"with all " + r"interventions or as a causal association between long-term sources and covariate and " + r"the consumption of the behavioral research results. 
in this article, we describe the method " + r"of the results of responses and work in a variety of high-current for genetic variants are " + r"problematic in statisticians}")) + +with Frame(title=r"") as f: + f.append(NoEscape(r"\texttt{Input:} Inverse probability of treatment weights were \\~\\")) + f.append(NoEscape(r"\texttt{Output:} Inverse probability of treatment weights were associated with all " + r"interventions or as a causal association between long-term sources and covariate and " + r"the consumption of the behavioral research results. in this article, we describe the method " + r"of the results of responses and work in a variety of high-current for genetic variants are " + r"problematic in statisticians")) + +with Frame(title=r"") as f: + f.append(NoEscape(r"\texttt{Input:} results were statistically significant (p=0. \\~\\")) + f.append(NoEscape(r"\textcolor{white}{\texttt{Output:} Results were statistically significant (p=0.011)}")) + +with Frame(title=r"") as f: + f.append(NoEscape(r"\texttt{Input:} results were statistically significant (p=0. 
\\~\\")) + f.append(NoEscape(r"\texttt{Output:} Results were statistically significant (p=0.011)")) + +with Frame(title=r"") as f: + f.append(NoEscape(r"\texttt{Output:} a causal inference approach to interpret, and the a propensity score " + r"(ps)")) + f.append(NoEscape(r"\\~\\~\\")) + f.append(NoEscape(r"\texttt{Output:} controlling for pregnancy manifererisequally associated \\~\\~\\")) + f.append(NoEscape(r"\texttt{Output:} Was protective but not statistically significant (p=0.02)\\")) + f.append(NoEscape(r"\texttt{Output:} We found no evidence of a causal effect (p=0.012)")) + +################################ +# Installation & IDEs + +with Frame(title=NoEscape(r"Getting Started with \faPython")) as f: + f.append(r"https://www.python.org/downloads/") + f.append(NoEscape(r"\begin{center}" + r"\includegraphics[width=0.95\linewidth]{images/python_org.PNG}" + r"\end{center}")) + f.append(NoEscape(r"But often will want multiple versions of \faPython\; available")) + +with Frame(title=NoEscape(r"A Better Way...")) as f: + f.append(NoEscape(r"Use \texttt{pyenv} \footnote[frame]{A good introduction is available at " + r"https://realpython.com/intro-to-pyenv/} ~\\~\\")) + f.append(NoEscape(r"\begin{center}" + r"\includegraphics[width=1.0\linewidth]{images/pyenv_versions.PNG}" + r"\end{center}")) + +with Frame(title=NoEscape(r"Getting Started with \faPython")) as f: + f.append(NoEscape(r"Integrated Development Environment (IDE)")) + with doc.create(Itemize()) as itemize: + itemize.add_item("PyCharm") + itemize.add_item("Jupyter Notebook") + itemize.add_item("Atom") + itemize.add_item("RStudio") + +with Frame(title=NoEscape(r"Essential Packages")) as f: + f.append("Basics") + with doc.create(Itemize()) as itemize: + itemize.add_item("NumPy, SciPy, pandas") + f.append("Statistics") + with doc.create(Itemize()) as itemize: + itemize.add_item("statsmodels, lifelines") + f.append("Visualization") + with doc.create(Itemize()) as itemize: + itemize.add_item("matplotlib, 
seaborn") + f.append("Machine learning") + with doc.create(Itemize()) as itemize: + itemize.add_item("sci-kit learn, torch") + +with Frame(title=NoEscape(r"Learning \faPython")) as f: + f.append(NoEscape(r"A number of online resources \\~\\")) + f.append("Some I've made:") + with doc.create(Itemize()) as itemize: + itemize.add_item("https://github.com/pzivich/Python-for-Epidemiologists") + itemize.add_item("https://github.com/pzivich/publications-code") + itemize.add_item(NoEscape(r"Smith MJ et al. (2022). Introduction to computational causal inference using " + r"reproducible Stata, R, and Python code: A tutorial. " + r"\textit{Statistics in Medicine}, 41(2), 407-432.")) + f.append(NoEscape(r"~\\")) + f.append("What worked for me:") + with doc.create(Itemize()) as itemize: + itemize.add_item(NoEscape(r"Replicate a completed project in \faPython")) + itemize.add_item(NoEscape(r"Then start a project in \faPython")) + +################################ +# Conclusion +with Frame(title=r"Conclusions") as f: + f.append(NoEscape(r"Be familiar with more than one software")) + f.append(NoEscape(r"\\~\\")) + f.append(NoEscape(r"Strongly consider \faPython \; as the next")) + with doc.create(Itemize()) as itemize: + itemize.add_item("Language features") + itemize.add_item("Cross-software capabilities") + itemize.add_item("Popularity") + f.append(NoEscape(r"~\\")) + f.append(NoEscape(r"Uptake in epidemiology / biostatistics is low")) + with doc.create(Itemize()) as itemize: + itemize.add_item("More dominated by comp sci / data science") + itemize.add_item("Lots of opportunity for contributions") + +with Frame(title=r"") as f: + f.append(NoEscape(r"\huge \centering Questions?")) + +################################ +# END OF DOCUMENT + +doc.generate_pdf(clean_tex=True, clean=True, compiler='pdfLaTeX') diff --git a/ISCB43/loop_example.py b/ISCB43/loop_example.py new file mode 100644 index 0000000..f8fc853 --- /dev/null +++ b/ISCB43/loop_example.py @@ -0,0 +1,14 @@ +cost = 0. 
# ---- ISCB43/loop_example.py (continued) ----
# Slide example: total the cost of a grocery list with a loop and conditionals.
# `cost` is initialised (to 0.) by the statement immediately preceding this one.
grocery_items = ["apple", "celery", "bread"]
sale = False  # when True, bread is charged at 90% of its price

for item in grocery_items:
    # An item matches at most one name, so the branches are mutually exclusive
    if item == "apple":
        cost += 1.50
    elif item == "celery":
        cost += 3.50
    elif item == "bread":
        if sale:
            cost += (2.50*0.9)
        else:
            cost += 2.50

# ---- ISCB43/numpy_example.py (new file, index 0000000..e146642) ----
# Slide example: the same function name provided by two different modules.
import math
import numpy

math.sqrt(25)
numpy.sqrt(25)

# ---- ISCB43/stat_examples.py (new file, index 0000000..339e83d) ----
# Slide examples: short snippets of common analyses run on one data set.

# Loading data
import pandas as pd
d = pd.read_csv("example_data.csv")
d.info()

# Targeted Maximum Likelihood Estimation
from zepid.causal.doublyrobust import TMLE

tmle = TMLE(d, exposure="a", outcome="y")
tmle.exposure_model("x + z + x:z", bound=0.01)
tmle.outcome_model("a + x + z + a:x")
tmle.fit()
tmle.summary(decimal=2)

# Logistic Regression
import statsmodels.api as sm
import statsmodels.formula.api as smf

fm = smf.glm("y ~ x + z",
             d,
             family=sm.families.Binomial()).fit()
print(fm.summary())

# Inverse Probability Weighting
# NOTE(review): every other model in this script uses covariates `x` and `z`;
# confirm that example_data.csv really contains x_1, x_2 and x_3.
fm = smf.glm("a ~ x_1 + x_2 + x_3",
             d,
             family=sm.families.Binomial()).fit()
pi_a = fm.predict()  # fitted Pr(a = 1 | covariates) for each row of d

# Weight is 1 / Pr(a = observed value): pi_a when a == 1, (1 - pi_a) when a == 0
ipw = 1 / (d['a'] * pi_a + (1-d['a'])*(1-pi_a))

# Binomial family with an identity link for the weighted outcome model.
# d.index is passed as the GEE grouping variable.
f = sm.families.family.Binomial(sm.families.links.identity())
msm = smf.gee("y ~ a", d.index, d,
              weights=ipw,
              family=f).fit()
print(msm.summary())

# Cox Proportional Hazards
from lifelines import CoxPHFitter

cph = CoxPHFitter()
# The stratification column has to be present in the frame handed to fit();
# the subset previously omitted 'x', so strata='x' could not be resolved.
cph.fit(d[['time', 'delta', 'a', 'z', 'x']],
        duration_col='time',
        event_col='delta',
        strata='x')
cph.print_summary()