From 43dab99c2eee1ee874ee38fe87b80b4d7d36c543 Mon Sep 17 00:00:00 2001
From: Arthur Zalevsky
Date: Sat, 11 Nov 2023 14:09:14 -0800
Subject: [PATCH] This is also a deprecated dev/test snippet

---
Review note: the OpenAI key that was committed here has been replaced by an
environment lookup (the exposed key must be revoked), and two undefined names
were fixed. The blob id on the index line of tests/context_retrieve.py
predates this revision.

 tests/context_retrieve.py | 168 ++++++++++++++++++++++++++++++++++++++
 tests/input_file.txt      |   1 +
 2 files changed, 169 insertions(+)
 create mode 100644 tests/context_retrieve.py
 create mode 100644 tests/input_file.txt

diff --git a/tests/context_retrieve.py b/tests/context_retrieve.py
new file mode 100644
index 0000000..f4c4fc2
--- /dev/null
+++ b/tests/context_retrieve.py
@@ -0,0 +1,168 @@
+"""Deprecated dev/test snippet: embed a paper's Methods section and query it.
+
+Fetches the full text of a Europe PMC article, extracts its Methods section,
+embeds the text with OpenAI, stores the fragments, and asks one question
+through a FAISS-backed retrieval chain.
+
+The OpenAI key is read from the OPENAI_API_KEY environment variable and must
+never be committed to the repository.
+"""
+
+import os
+import re
+import sys
+import xml.etree.ElementTree as ET
+
+import openai
+import requests
+from langchain import PromptTemplate
+from langchain.chains import RetrievalQA
+from langchain.chat_models import ChatOpenAI
+from langchain.document_loaders import TextLoader
+from langchain.document_loaders.csv_loader import CSVLoader
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.vectorstores import FAISS
+
+from fragment import Fragment
+from VectorDatabase import Latern
+
+
+# OpenAI setup: the key comes from the environment. It is None when unset,
+# and the OpenAI clients then report the missing key themselves.
+OPEN_API_KEY = os.getenv("OPENAI_API_KEY")
+
+# Seconds to wait for Europe PMC before giving up.
+REQUEST_TIMEOUT = 30
+
+
+def getPmcPaper(pmcid):
+    """Return the full-text XML of a Europe PMC article as a string.
+
+    Args:
+        pmcid: Article identifier inserted into the Europe PMC REST URL.
+
+    Raises:
+        requests.HTTPError: If the service answers with an error status.
+    """
+    url = f'https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML'
+    response = requests.get(url, timeout=REQUEST_TIMEOUT)
+    # Fail loudly instead of handing an error page to the XML parser.
+    response.raise_for_status()
+    return response.text
+
+
+def extractMethodsFromPmcPaper(paper):
+    """Return the text of every section whose own title mentions "methods".
+
+    Args:
+        paper: Full-text article XML as a string.
+
+    Returns:
+        The text nodes of the matching <sec> elements joined with spaces, or
+        an empty string when no section title matches.
+    """
+    tree = ET.fromstring(paper)
+    mtext = []
+    seen = set()
+    for sec in tree.iter('sec'):
+        if id(sec) in seen:
+            # Already emitted as part of an enclosing matching section.
+            continue
+        title = sec.find('title')
+        if title is None:
+            continue
+        # itertext() also covers titles that contain inline markup.
+        if re.search('methods', ''.join(title.itertext()), re.IGNORECASE):
+            mtext.extend(sec.itertext())
+            seen.update(id(sub) for sub in sec.iter('sec'))
+    return " ".join(mtext)
+
+
+def preprocess(input_text):
+    """Return input_text with every newline character removed."""
+    return input_text.replace("\n", "")
+
+
+def get_embeddings(fname):
+    """Split a text file into chunks and embed each chunk with OpenAI.
+
+    Args:
+        fname: Path of the text file to load.
+
+    Returns:
+        A (text_embeddings, emb) tuple: the list of (chunk text, embedding)
+        pairs and the OpenAIEmbeddings instance that produced them.
+    """
+    documents = TextLoader(fname).load()
+    text_splitter = CharacterTextSplitter(separator=".", chunk_size=1000, chunk_overlap=0)
+    docs = text_splitter.split_documents(documents)
+
+    emb = OpenAIEmbeddings()
+    input_texts = [d.page_content for d in docs]
+    input_embeddings = emb.embed_documents(input_texts)
+    return list(zip(input_texts, input_embeddings)), emb
+
+
+def saveFassIndex(fname, sname):
+    """Embed the text file fname and save the resulting FAISS index to sname."""
+    txt_embs, emb = get_embeddings(fname)
+    faissIndex = FAISS.from_embeddings(text_embeddings=txt_embs, embedding=emb)
+    faissIndex.save_local(sname)
+
+
+def Query(input_query, faiss_obj):
+    """Ask input_query against faiss_obj; print and return the answer.
+
+    Args:
+        input_query: Question text; the prompt template appends a "?".
+        faiss_obj: FAISS vector store, used as a top-1 similarity retriever.
+    """
+    chatbot = RetrievalQA.from_chain_type(
+        llm=ChatOpenAI(
+            openai_api_key=OPEN_API_KEY,
+            temperature=0, model_name="gpt-3.5-turbo", max_tokens=50
+        ),
+        chain_type="stuff",
+        retriever=faiss_obj.as_retriever(search_type="similarity", search_kwargs={"k": 1})
+    )
+    prompt = PromptTemplate(
+        input_variables=["query"],
+        template=""" {query}? """,
+    )
+    answer = chatbot.run(prompt.format(query=input_query))
+    print(answer)
+    return answer
+
+
+def main(pmcid=None):
+    """Run the fetch / embed / store / query pipeline for one article.
+
+    Args:
+        pmcid: Europe PMC article id; defaults to the first command-line
+            argument.
+    """
+    if pmcid is None:
+        if len(sys.argv) < 2:
+            raise SystemExit(f'usage: {sys.argv[0]} PMCID')
+        pmcid = sys.argv[1]
+
+    text = getPmcPaper(pmcid)
+    methods_text = preprocess(extractMethodsFromPmcPaper(text))
+    fname = 'input_file.txt'
+    with open(fname, 'w') as file:
+        file.write(methods_text)
+    txt_embs, emb = get_embeddings(fname)
+
+    fragments = [Fragment(pmcid, 'methods', txt, embs) for txt, embs in txt_embs]
+    latern = Latern()
+    latern.insertEmbeddings(fragments)
+
+    # Retrieve: answer one question from the freshly embedded Methods text.
+    faissIndex = FAISS.from_embeddings(text_embeddings=txt_embs, embedding=emb)
+    inp_query = "Does the paper report a new structure of a biomolecule or biomolecular complex modeled using experimental data"
+    Query(inp_query, faissIndex)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/input_file.txt b/tests/input_file.txt
new file mode 100644
index 0000000..1eea3d8
--- /dev/null
+++ b/tests/input_file.txt
@@ -0,0 +1 @@
+Methods Mouse line generation and validation All animal work was performed in accordance with approved Yale IACUC protocols (#2019–11167 and #2020–07271). The HACNS1 and chimpanzee ortholog lines were generated at the Yale Genome Editing Center using standard gene targeting techniques in mouse ES cells 71 . C57BL/6J- A w−J /J mouse ES cells, generated by the Yale Genome Editing Center from C57BL/6J- A w−J /J mice obtained from The Jackson Laboratory (RRID:IMSR_JAX:000051), were edited by electroporation of a GFP cloning vector containing human (1241 bp) or chimpanzee (1240 bp) sequence flanked by C57BL/6 J mouse sequence homology arms, floxed pPGKneo vector, and diphtheria toxin sequence (Supplementary Fig.  1 A) 72 . The genomic coordinates of the human (hg19; https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/ ), chimpanzee (panTro4; https://www.ncbi.nlm.nih.gov/assembly/GCF_000001515.6/ ), and mouse (mm9; https://www.ncbi.nlm.nih.gov/assembly/GCF_000001635.18/ ) sequences used in the editing constructs, including the mouse homology arm sequences, are listed in Supplementary Data  1 73 . Positive clones were karyotyped and only clones exhibiting a normal mouse karyotype were used for blastocyst injection. Resulting G0 chimeras were backcrossed to wild type C57BL/6 J (RRID: IMSR_JAX:000664) and crossed with an actin-Cre C57BL/6 J mouse line to remove the neo cassette. All mice used in our analysis were from F9 or later generations. 
Mice were maintained in a Yale Animal Resources Center (YARC) managed facility under a standard 12 h light/dark cycle and environmental monitoring according to YARC policies and procedures. Genotyping primers specific to HACNS1 , chimpanzee, and mouse orthologs are listed in Supplementary Data  10 . Cloning primers listed in Supplementary Data  10 were used to amplify edited loci for cloning and Sanger sequencing for comparison to the hg19 or panTro4 sequence. Sanger sequencing data is available at http://noonan.ycga.yale.edu/noonan_public/Dutrow_HACNS1/ . The sequence identity between the human (hg19, chr2:236773456-236774696) and chimpanzee alleles (panTro4, chr2B:241105291-241106530) is 98.2% (22 substitutions total, of which 15 are fixed in humans). Human-specific substitutions were defined as fixed if the derived allele frequency in dbSNP (v153) was >=0.9999 and if the ancestral sequence state was conserved between chimpanzee, rhesus macaque, orangutan, and marmoset. We provide a detailed analysis of sequence differences between the human, chimpanzee and mouse orthologs in the Supplemental Note (Supplementary Materials). HACNS1-GBX2 locus TAD coordinates (hg19 chr2:236655261-237135261) are from H1 human ES cell Hi-C data; HACNS1 and GBX2 are present in the same TAD and GBX2 is the only annotated protein-coding gene whose promoter is included in this TAD 32 . Copy number verification qPCR was performed using genomic DNA from three F9 mice from each line using Power SYBR Green Mastermix (Thermo Fisher Scientific #4368577) and the StepOnePlus Real-Time PCR System (Applied Biosystems) with primers listed in Supplementary Data  10 . All biological replicates of each genotype were run in triplicate and Ct values of each were normalized to a control region on a different chromosome (see Supplementary Data  10 ). Primary qPCR results are available as Source Data. 
Chromatin Immunoprecipitation, ChIP-qPCR and ChIP-seq Tissue for chromatin preparation was collected from E11.5 forelimb and hindlimb bud pairs or pharyngeal arch tissue from HACNS1 and chimpanzee ortholog line heterozygous crosses to obtain pooled, litter matched limb bud or pharyngeal arch samples for all three genotypes ( HACNS1 homozygous, chimpanzee ortholog line, and wild type). Two biological replicates were used per genotype per tissue, each with tissue pooled from three embryos. Pooled tissue was crosslinked and sonicated as previously described 74 . Chromatin for each genotype, tissue, and replicate was used for H3K27ac or H3K4me2 immunoprecipitation with 7.5 μg antibody and ~5 μg tissue per ChIP assay using Active Motif #39133 (RRID: AB_2561016) and Active Motif #39913 (RRID: AB_2614976) as previously described 74 , 75 . ChIP-qPCR was performed using Power SYBR Green Mastermix (Thermo Fisher Scientific #4368577) with primers listed in Supplementary Data  11 . Samples were sequenced (2 × 100 bp) using standard Illumina protocols on an Illumina HiSeq 4000 (RRID: SCR_016386). To control for batch effects, all samples of the same tissue type were multiplexed and sequenced on a single lane. Reference genomes edited to replace the mouse ortholog of HACNS1 with the human or chimpanzee sequence were built using Bowtie (v2.2.8; RRID: SCR_005476) 76 . ChIP-seq raw reads were aligned to the mm9, mm9 with chimpanzee ortholog, or humanized mm9 reference genome using Bowtie with -sensitive and -no-unal settings. GC composition was assessed using fastQC and showed that GC content and bias were consistent across all experiments 77 , 78 . Tag directories for each experiment were generated using makeTagDirectory in HOMER with default settings and standard normalization to 10 million tags, and were used to generate bigwig files for visualization with makeUCSCfile 23 . 
All peaks were called with HOMER (v4.9.1 RRID: SCR_010881) using default settings for -histone (IP vs input fold change = 4, p  = 0.0001, peak size = 500, minDist = 1000) 23 . All differential peaks were called with DESeq2 implemented in HOMER using getDifferentialPeaksReplicates.pl with default settings (fold change cutoff = 2, FDR cutoff = 5%); briefly, reads from each comparison are pooled, with ChIP and inputs pooled separately, such that new peaks are called and used for quantitative comparison between genotypes 23 , 24 . The complete datasets of all peaks tested in differential analyses can be found at http://noonan.ycga.yale.edu/noonan_public/Dutrow_HACNS1/ . RNA extraction and RT-qPCR E11-E12 embryos were collected from six HACNS1 homozygous, chimpanzee ortholog line, or wild type litters generated by crossing homozygous animals for each line. All embryos within each genotype group were ordered based on stage (>70 total embryos) and were divided into six timepoint groups per genotype consisting of forelimb or hindlimb buds from 4-6 pooled embryos per time point per genotype per tissue. RNA was purified using the Qiagen miRNeasy Kit (#74106). Invitrogen Superscript III Reverse Transcription Kit (#18080-051) was used to prepare cDNA from each sample. qPCR with the resulting cDNA was performed using Power SYBR Green Mastermix (Thermo Fisher Scientific #4368577). All samples were analyzed in triplicate using primers listed in Supplementary Data  12 and Ct values of Gbx2 were normalized to Hprt1 . Primary RT-qPCR results are available as Source Data. Whole mount in situ hybridization E11-E12 mouse embryos were collected from HACNS1 homozygous ( n  = 7 litters), chimpanzee ortholog line ( n  = 8 litters), and wild type ( n  = 12 litters) homozygous crosses. Embryos were fixed and hybridized with the same preparation of antisense Gbx2 mRNA probe under identical conditions as previously described 78 , 79 . 
The Gbx2 probe used for hybridization contains the full mouse consensus CDS sequence (CCDS15150.1); NCBI CCDS Release 23; https://www.ncbi.nlm.nih.gov/projects/CCDS/CcdsBrowse.cgi?REQUEST=ALLFIELDS&DATA=CCDS15150.1&ORGANISM=10090&BUILDS=CURRENTBUILDS (NCBI CCDS Release 23 CCDS15150.1). The 178 embryos (55 from the HACNS1 knock-in line, 52 from the chimpanzee ortholog line, and 71 from wild type) were divided into temporally-ordered sextiles within the E11-E12 window (~40–48 somites, although we did not rely on somite counts for staging) based on measurement of crown-rump length for each individual embryo 35 . For the data shown in Fig.  3B , embryos were assessed for staining pattern by three individuals blinded to genotype under a stereo microscope (Leica S6D). For the data shown in Supplementary Fig.  3A, B embryos were annotated by a single scorer blinded to genotype. The scoring scheme was based on previous studies, notably to assess whole-mount gene expression patterns as described in the VISTA Enhancer Browser ( http://enhancer.lbl.gov/ ) 10 , 34 , 36 . Embryos were assigned to one of eleven categories of Gbx2 expression pattern based on the anterior-posterior and proximal-distal localization of staining as well the intensity (strong versus weak) of staining: 1: anterior and posterior (AP); 2: anterior distal and posterior distal (APD); 3: distal (D); 4: anterior distal (AD); 5: anterior (A); 6: weak anterior and posterior (APL); 7: weak anterior (AL); 8: weak distal (DL); 9: weak anterior and posterior distal (APDL); 10: weak anterior distal (ADL); 11: no staining (N). Categories were merged for clarity in Fig.  3B in the following manner: categories 1–3: anterior and posterior; categories 4–5: anterior only; categories 6–10: weak staining. See Fig.  3B for representative images of staining patterns illustrating the scoring scheme used for qualitative assessment of expression. 
Representative images were taken using a Zeiss Stemi 2000-C stereomicroscope fitted with an AxioCam MRc5 digital camera and Zeiss AxioVision software. Images and associated annotations are available as Source Data. Single-cell RNA-sequencing Sample preparation Tissue for scRNA-seq was collected at E11.5 from two human ortholog line homozygous litters, two chimpanzee ortholog line homozygous litters, and two wild type litters. Embryos were staged as previously described in order to obtain samples from stage-matched T3 embryos from each genotype. Left hindlimb buds from three embryos per genotype per replicate were pooled. Following dissection, the tissue was immediately placed in CMFSG saline–glucose solution (1x Calcium–magnesium-free phosphate buffered saline from Thermo Fisher Scientific #21-040-CV with 0.1% glucose from Corning 45% Glucose #45001-116) on ice. Gibco TrypLE Express digestion solution was used for cellular dissociation (Thermo Fisher Scientific # 2605010). The dissociation reaction was stopped using 1xDMEM (ATCC 30–2002) with 10% heat-inactivated Fetal Bovine Serum (Sigma-Aldrich #F4135). The dissociated cells were filtered through a 40 μM strainer and harvested by centrifugation at 4 °C. Cells were washed and resuspended in 1x Calcium–magnesium-free phosphate buffered saline (Thermo Fisher Scientific #21-040-CV) with 0.04% BSA (Sigma-Aldrich #SRE0036). Cell number and viability were estimated on a Countess II Automated Cell Counter prior to library preparation of 10,000 cells (estimated cell recovery from 16,000 input cells) per sample using Chromium Single Cell 3ʹ GEM, Library & Gel Bead Kit v3 (10X Genomics PN-1000075). Libraries were sequenced (2 × 75 bp) on an Illumina HiSeq 4000 (RRID: SCR_016386). To control for batch effects, all samples were multiplexed across all lanes. Count matrices were produced from raw sequencing data using the Cell Ranger v3.0.2 package from 10X Genomics (RRID: SCR_017344). 
Data filtering and preprocessing Matrices from the 10x Cell Ranger platform were filtered and preprocessed using Seurat v3.0.1 (RRID: SCR_016341) 38 . Prior to the generation of Seurat objects, Xist gene counts were eliminated in order to avoid clustering by sex within mixed sample populations. Genes expressed in fewer than 5 cells per sample were removed. Cells with greater than 7.5% or 2% counts from mitochondrial genes or hemoglobin genes, respectively, were removed. Cells with total gene count (nGene) z-scores less than -1 (corresponding to ~700 or fewer detected genes) or greater than 4 (corresponding to ~6000 or greater detected genes) were removed, as were cells with total UMI count (nUMI) z-scores greater than 7 (corresponding to ~50,000 or greater detected UMIs; see Supplementary Fig.  5 ). One chimpanzee ortholog line replicate was removed during pre-processing due to high overall mitochondrial gene expression, indicative of low viability. Prior to data integration, expression values from each sample were normalized based on library size for pre-processing purposes only using the Seurat tool NormalizeData 38 . Louvain clustering as implemented in Seurat was performed for pre-processing purposes only using FindVariableFeatures, ScaleData, RunPCA, FindNeighbors, and FindClusters in order to remove endothelial cell clusters ( Cd34 -positive and Pf4 -positive), clusters characterized by aberrant mitochondrial gene expression (low mt-Co1 ), and transcriptionally distinct clusters containing fewer than 30 cells per sample 38 , 42 . The numbers of cells remaining after pre-processing for each sample are listed in Supplementary Data  13 . Data normalization and integration All subsequent normalization and integration steps after pre-processing were performed with raw counts for all cells retained after pre-processing (see Supplementary Data  13 ). 
Cell cycle scores were computed using CellCycleScoring in Seurat to regress out the difference between G2M and S phases, effectively preserving differences between cycling and non-cycling cells while reducing differences related to cell cycle amongst proliferating cells 38 . In addition to cell cycle scores, percent mitochondrial gene expression and nUMI values were regressed using SCTransform (SCT) in order to reduce the effects of sequencing depth and minor differences in mitochondrial DNA expression related to viability 38 , 80 . All SCT normalized datasets containing all genes from individual samples were integrated using SelectIntegrationFeatures, PrepSCTIntegration, FindIntegrationAnchors, and IntegrateData 38 , 80 . Following integration, the combined dataset was randomly down-sampled to contain a maximum of 10,000 cells per genotype prior to embedding and clustering using SubsetData in Seurat 38 . PCA, UMAP, and Louvain clustering were implemented in Seurat using RunPCA, RunUMAP, FindNeighbors, and FindClusters 38 , 41 . Percentages of cells belonging to each Louvain cluster are shown in Supplementary Data  13 . Normalized data from all samples combined were used for imputation using ALRA with default settings for the purposes of data visualization as shown in Fig.  4A–D , Supplementary Fig. 4A, B , and Fig. 5C, D 81 . Marker gene expression was compared between ALRA-imputed and unimputed data to establish that imputation did not substantially distort marker gene expression patterns in our dataset (Supplementary Fig.  4 , Supplementary Data  13 ). Data normalization and integration, UMAP embedding, and Louvain clustering were performed prior to imputation. The threshold for identifying Gbx2 -positive cells was set as an imputed Gbx2 expression value greater than 0.1. This threshold was also used for identifying percentages of marker gene-positive cells in unimputed and imputed data as shown in Supplementary Data  13 . 
All gene expression scaling and centering for visualization purposes was performed on normalized imputed or unimputed data using the Seurat ScaleData function with default parameters (scale.max = 10) 38 . MELD, MAGIC, kNN-DREMI analyses Cells belonging to mesenchymal cell clusters (clusters 1–4, see Fig.  4A, C ) from all genotypes were used for MELD, MAGIC, kNN-DREMI, and Gene Set Enrichment Analysis (GSEA). Scaled data matrices from the Seurat object integrated assay were loaded using scprep for MELD, MAGIC, and kNN-DREMI ( https://github.com/krishnaswamylab/scprep ). MELD and MAGIC both denoise scRNA-seq data using graphs to model cellular state space. The same graph signal was used for both MELD and MAGIC as calculated by graphtools (1.5.2) with n_pca = 20, decay = 40, and knn = 10. MELD was run on one-hot vectors for each genotype independently using default parameters 55 . MAGIC was performed using the same graph signal as MELD 54 . We used the kNN-DREMI implementation provided in scprep and kNN-DREMI was run on MAGIC-imputed data 53 . kNN-DREMI analysis was used in order to identify genes with expression levels associated with either Gbx2 expression in humanized hindlimb or cells with increased humanized RL as calculated using MELD. MAGIC was employed only for the purpose of generating denoised gene expression values for kNN-DREMI analysis of gene-gene relationships but was not used for data visualization, clustering, or sample-associated density estimation using MELD. Gene set enrichment analysis GSEA was performed using topGO v.2.34.0 (RRID: SCR_014798) on all expressed genes that were ranked by Gbx2- DREMI or humanized RL-DREMI score from the aforementioned humanized mesenchymal cell kNN-DREMI analysis 82 . Significant nodes were identified using a Kolmogorov–Smirnov test and the algorithm = “elim” argument. 
Ontologies listed in Supplementary Data  5 and 6 are the top 30 nodes with fewer than 100 annotated genes (to remove non-specific categories) and at least one gene in the top 20% of DREMI scores. Heatmap hierarchical clustering was performed using pheatmap v1.0.12 (RRID: SCR_016418) 83 . Skeletal staining E18.5 skeletons from two litters from each of HACNS1 homozygous, chimpanzee ortholog line, and wild type homozygous crosses ( n  = 48 embryos) were stained with Alcian Blue and Alizarin Red as previously described 71 . Skeletons were imaged under a stereo microscope (Leica S6D) and measured by a single scorer blinded to genotype using ImageJ 2.0.0. Bone and cartilage lengths of the forelimb and hindlimb pelvic girdle, stylopod, zeugopod, and autopod were measured blinded to genotype using ImageJ 2.0.0. Forelimb measurements include metacarpals 1–5 (cartilage), proximal phalanges 1–5 (cartilage), intermediate phalanges 2–5 (cartilage), distal phalanges 1–5 (cartilage), scapula (bone and cartilage), humerus (bone and cartilage), radius (bone and cartilage), and ulna (bone and cartilage). Hindlimb measurements include metatarsals 1–5 (cartilage), proximal phalanges 1–5 (cartilage), intermediate phalanges 2–5 (cartilage), distal phalanges 1–5 (cartilage), tibia (bone and cartilage), femur (bone and cartilage), pelvis (cartilage), ilium (bone), ischium (bone), pubis (bone), fibula (bone), calcaneum (cartilage), and talus (cartilage). Digit length was calculated as the sum of all metacarpal/metatarsal and phalanx segments. Raw measurements and digit length were normalized to the length of ossified humerus or femur for forelimb or hindlimb digits, respectively. Phalange to metacarpal ratio was calculated as the ratio of the sum of the phalange lengths of each digit to the corresponding metacarpal/metatarsal segment. Interdigital ratios were calculated using raw digit lengths. Raw measurements and images are available as Source Data. 
ANOVA analysis for gene expression and morphometry ANOVA analysis was performed with the lme4 package in R (RRID: SCR_015654) using default parameters to dissect the effects of genotype on limb segment length (morphometric data) 84 . We calculated the effects of genotype, litter, sex, forelimb versus hindlimb, digit number, and right versus left (RL) on normalized digit length, phalange to metacarpal ratio and interdigital ratio (Length Ratio ~ Genotype * (1  |  Genotype/Litter) * Sex * Limb * Digit * (1  |  RL) * (1  |  Litter/Embryo) * (1  |  Sex/Embryo) * (1  |  Genotype/Embryo)) . Correction for multiple comparisons was performed using the Benjamini & Hochberg method 85 . Statistics and reproducibility All ChIP-seq findings were validated using ChIP-qPCR of both the sequenced samples as well as additional biological replicates. Specificity of H3K27ac and H3K4me2 antibodies was validated by the authors using dot blot analysis. Additional validation measures including dot blot analysis and ChIP-qPCR were performed by Active Motif ( https://www.activemotif.com/documents/tds/39133.pdf and https://www.activemotif.com/documents/tds/39913.pdf ). RT-qPCR results shown in Supplementary Fig.  3C were validated with additional biological and technical replicates. All attempts at replication were successful. No statistical methods were used to predetermine sample size for ChIP-seq and RT-qPCR analyses and no data were excluded from these analyses. All samples prepared for ChIP-seq, RT-qPCR, ISH, scRNA-seq, or morphometric analysis as shown in the final figures were treated identically and processed in parallel. No statistical methods were used to predetermine optimal sample sizes for morphometric and ISH analyses. Instead, morphometric studies and ISH analyses were done using large sample sizes: limb samples from 48 embryos for morphometry and over 100 embryos obtained from multiple litters for each genotype for ISH analyses. 
For ISH and morphometric analyses, no data were excluded from the analyses; missing data values indicate samples that could not be evaluated/measured due to damage to the specimen. One scRNA-seq replicate from the chimpanzee ortholog line was excluded based on high overall mitochondrial gene expression, indicative of low sample quality based on preestablished filtering metrics. Qualitative analysis of ISH results was performed using a blinded approach by randomizing embryo identification numbers prior to annotation as described in the Methods. Morphometric data was collected blinded to genotype by randomizing sample identification numbers. ChIP-seq, RT-qPCR, and scRNA-seq were performed without group allocation blinding as all biological and technical replicates were processed identically and in parallel and no qualitative analyses were required for these experiments. We did not consider the sex of embryonic samples as a variable in our studies. Reporting summary Further information on research design is available in the Nature Research Reporting Summary linked to this article. \ No newline at end of file