-
Notifications
You must be signed in to change notification settings - Fork 0
/
part2.py
36 lines (27 loc) · 1.72 KB
/
part2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from Bio import AlignIO
from collections import Counter
import pandas as pd
# Read the MSA from the file
msa = AlignIO.read("(20seq msa).aln-clustal_num", "clustal")
# Convert the MSA to a list of strings
msa_sequences = [str(record.seq) for record in msa]
# Split the sequences into Omicron and Delta
omicron_sequences = msa_sequences[:10]
delta_sequences = msa_sequences[10:]
# Define a function to calculate the average nucleotide percentages and CG content for a list of sequences
def calculate_averages(sequences):
# Calculate the frequency of each nucleotide and the CG content for each sequence
nucleotide_freqs = [Counter(seq) for seq in sequences]
nucleotide_percents = [{nucleotide: count / len(seq) * 100 for nucleotide, count in freq.items()} for seq, freq in zip(sequences, nucleotide_freqs)]
cg_contents = [(freq.get('C', 0) + freq.get('G', 0)) / len(seq) * 100 for seq, freq in zip(sequences, nucleotide_freqs)]
# Calculate the average percentage of each nucleotide and the average CG content
average_nucleotide_percent = {nucleotide: sum(percent[nucleotide] for percent in nucleotide_percents if nucleotide in percent) / len(nucleotide_percents) for nucleotide in ('A', 'C', 'G', 'T')}
average_cg_content = sum(cg_contents) / len(cg_contents)
return average_nucleotide_percent, average_cg_content
# Calculate the averages for Omicron and Delta
omicron_averages = calculate_averages(omicron_sequences)
delta_averages = calculate_averages(delta_sequences)
# Create a DataFrame to store the averages
df = pd.DataFrame([omicron_averages, delta_averages], index=['Omicron', 'Delta'], columns=['Average Nucleotide Percent', 'Average CG Content'])
# Save the DataFrame to a CSV file
df.to_csv("averages.csv")