Bioinformatics, which is also called Biological Data Science, is an awesome and fascinating field of science.
In this tutorial we will build a simple app using Streamlit to do some basic sequence analysis and dot plot of two DNA sequences.
We will be using BioPython and NeatBio to process our sequences, and Streamlit to build the UI.
Let us first install our necessary packages
pip install biopython neatbio streamlit
The Structure of our App
Our simple app will have two main sections
- DNA Sequence: where we analyse our sequence from a FASTA file
- DotPlot: where we compare two different sequences and generate a dotplot
In designing the first section for the DNA Sequence, we will use the file_uploader (drag and drop) feature of Streamlit and then process our FASTA file using BioPython and NeatBio. We will restrict uploads to FASTA files only, that is, files with a .fasta or .fa extension.
To get a more elaborate tutorial on BioPython you can check out this course.
We will then do the basics such as
- DNA Composition (GC content,AT content)
- Transcription
- Translation
- Plot our nucleotide frequency
- Amino acid frequency
In the second part we will compare two sequences and generate a dotplot for them. To generate the dotplot we will use a custom function as well as matplotlib to produce a beautiful dotplot.
Below is the entire code for the app.
import streamlit as st
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use("Agg")
from Bio.Seq import Seq
from Bio import SeqIO
from collections import Counter
import neatbio.sequtils as utils
import numpy as np
from PIL import Image
def delta(x, y):
    """Return 0 when ``x`` equals ``y`` and 1 otherwise (a mismatch score)."""
    return 0 if x == y else 1
def M(seq1, seq2, i, j, k):
    """Count mismatches between two length-``k`` windows.

    Compares ``seq1[i:i+k]`` with ``seq2[j:j+k]`` position by position and
    returns how many positions differ. ``zip`` stops at the shorter window,
    so positions past the end of either sequence are not counted.
    """
    window1 = seq1[i:i + k]
    window2 = seq2[j:j + k]
    # Each unequal pair contributes 1 (True) to the sum, each equal pair 0.
    return sum(x != y for x, y in zip(window1, window2))
def makeMatrix(seq1, seq2, k):
    """Build the window-mismatch matrix for two sequences.

    Entry ``[i][j]`` is ``M(seq1, seq2, i, j, k)``: the number of mismatches
    between the length-``k`` window of ``seq1`` starting at ``i`` and the
    length-``k`` window of ``seq2`` starting at ``j``.

    Returns a list of ``len(seq1) - k + 1`` rows, each holding
    ``len(seq2) - k + 1`` integers (empty when ``k`` exceeds a sequence's
    length, because the corresponding ``range`` is empty).
    """
    n_rows = len(seq1) - k + 1
    n_cols = len(seq2) - k + 1
    return [
        [M(seq1, seq2, i, j, k) for j in range(n_cols)]
        for i in range(n_rows)
    ]
def plotMatrix(M, t, seq1, seq2, nonblank=chr(0x25A0), blank=' '):
    """Print a text dot plot of a mismatch matrix to standard output.

    Args:
        M: Matrix (sequence of rows) of mismatch scores.
        t: Threshold; a cell is drawn with ``nonblank`` when its score is
            strictly below ``t``, otherwise with ``blank``.
        seq1: Sequence whose symbols label the rows.
        seq2: Sequence printed across the top as the column header.
        nonblank: Character used for cells below the threshold.
        blank: Character used for all other cells.

    Rows are paired with labels via ``zip``, so output stops at whichever of
    ``seq1`` or ``M`` is shorter.
    """
    # Formatting (rather than ``' |' + seq2``) yields the same text for plain
    # strings and also works for sequence objects that only define ``str()``.
    print(' |{}'.format(seq2))
    print('-' * (2 + len(seq2)))
    for label, row in zip(seq1, M):
        cells = ''.join(nonblank if score < t else blank for score in row)
        print('{}|{}'.format(label, cells))
def dotplot(seq1, seq2, k=1, t=1):
    """Print a text dot plot comparing ``seq1`` (rows) with ``seq2`` (columns).

    Args:
        seq1: First sequence.
        seq2: Second sequence.
        k: Window length passed to ``makeMatrix``.
        t: Threshold passed to ``plotMatrix``; cells with fewer than ``t``
            mismatches are drawn as filled.
    """
    matrix = makeMatrix(seq1, seq2, k)
    plotMatrix(matrix, t, seq1, seq2)  # experiment with character choice
# Convert to Fxn
def dotplotx(seq1, seq2):
    """Draw an image dot plot of two sequences on the current pyplot figure.

    The window-1 mismatch matrix from ``makeMatrix`` is rendered with
    ``plt.imshow``; the symbols of ``seq2`` label the x-axis ticks and the
    symbols of ``seq1`` label the y-axis ticks.
    """
    plt.imshow(np.array(makeMatrix(seq1, seq2, 1)))
    # on x-axis list all symbols of seq 2
    plt.xticks(np.arange(len(seq2)), list(seq2))
    # on y-axis list all symbols of seq 1
    plt.yticks(np.arange(len(seq1)), list(seq1))
    plt.show()
def gc_content(seq):
    """Return the percentage of G and C symbols in ``seq``.

    The sequence is converted with ``str()`` and upper-cased first, so
    lowercase bases are counted too. An empty sequence yields ``0.0`` instead
    of raising ``ZeroDivisionError``.
    """
    bases = str(seq).upper()
    if not bases:
        return 0.0
    gc_total = bases.count('G') + bases.count('C')
    return float(gc_total) / len(bases) * 100
def at_content(seq):
    """Return the percentage of A and T symbols in ``seq``.

    The sequence is converted with ``str()`` and upper-cased first, so
    lowercase bases are counted too. An empty sequence yields ``0.0`` instead
    of raising ``ZeroDivisionError``.
    """
    bases = str(seq).upper()
    if not bases:
        return 0.0
    at_total = bases.count('A') + bases.count('T')
    return float(at_total) / len(bases) * 100
def _read_fasta_record(uploaded_file):
    """Parse a Streamlit upload as a single-record FASTA file.

    Returns the parsed record, or ``None`` after showing an error in the UI
    when the content cannot be decoded or does not hold exactly one record.
    """
    import io  # local import: only this helper needs it

    try:
        raw = uploaded_file.getvalue()
        # The FASTA parser wants text; uploads may arrive as bytes.
        if isinstance(raw, bytes):
            raw = raw.decode("utf-8")
        return SeqIO.read(io.StringIO(raw), "fasta")
    except ValueError as err:  # also covers UnicodeDecodeError
        st.error("Could not read FASTA file: {}".format(err))
        return None


def _color_picker(label):
    """Show a colour picker, using the beta-prefixed widget as a fallback."""
    picker = getattr(st, "color_picker", None) or st.beta_color_picker
    return picker(label)


def _render_dna_page():
    """Render the single-sequence analysis page."""
    st.subheader("DNA Sequence Analysis")
    seq_file = st.file_uploader("Upload FASTA File", type=["fasta", "fa"])
    if seq_file is None:
        return
    dna_record = _read_fasta_record(seq_file)
    if dna_record is None:
        return
    dna_seq = dna_record.seq

    details = st.radio("Details", ("Description", "Sequence"))
    if details == "Description":
        st.write(dna_record.description)
    elif details == "Sequence":
        st.write(dna_record.seq)

    # Nucleotide Frequencies
    st.subheader("Nucleotide Frequency")
    dna_freq = Counter(dna_seq)
    st.write(dna_freq)
    base_colors = {
        "A": _color_picker("Adenine Color"),
        "T": _color_picker("thymine Color"),
        "G": _color_picker("Guanine Color"),
        "C": _color_picker("cytosil Color"),
    }
    if st.button("Plot Freq"):
        fig, ax = plt.subplots()
        bars = ax.bar(list(dna_freq.keys()), list(dna_freq.values()))
        # Colour each bar by the symbol it represents. Counter keeps symbols
        # in first-seen order, so fixed bar indices would mislabel bases and
        # fail when fewer than four distinct symbols occur.
        for base, bar in zip(dna_freq, bars):
            color = base_colors.get(str(base).upper())
            if color is not None:
                bar.set_color(color)
        st.pyplot(fig)
        plt.close(fig)

    st.subheader("DNA Composition")
    gc_score = utils.gc_content(str(dna_seq))
    at_score = utils.at_content(str(dna_seq))
    st.json({"GC Content": gc_score, "AT Content": at_score})

    # Nucleotide Count
    nt_count = st.text_input("Enter Nucleotide Here", "Type Nucleotide Alphabet")
    st.write("Number of {} Nucleotide is ::{}".format(
        nt_count, str(dna_seq).count(nt_count)))

    # Protein Synthesis
    st.subheader("Protein Synthesis")
    p1 = dna_seq.translate()
    aa_freq = Counter(str(p1))

    # NOTE: because this is an if/elif chain, a later checkbox is only
    # rendered while every earlier one is unchecked.
    if st.checkbox("Transcription"):
        st.write(dna_seq.transcribe())
    elif st.checkbox("Translation"):
        st.write(p1)
    elif st.checkbox("Complement"):
        st.write(dna_seq.complement())
    elif st.checkbox("AA Frequency"):
        st.write(aa_freq)
    elif st.checkbox("Plot AA Frequency"):
        aa_color = _color_picker("Pick An Amino Acid Color")
        fig, ax = plt.subplots()
        ax.bar(list(aa_freq.keys()), list(aa_freq.values()), color=aa_color)
        st.pyplot(fig)
        plt.close(fig)
    elif st.checkbox("Full Amino Acid Name"):
        aa_name = str(p1).replace("*", "")
        aa3 = utils.convert_1to3(aa_name)
        st.write(aa_name)
        st.write("=====================")
        st.write(aa3)
        st.write("=====================")
        st.write(utils.get_acid_name(aa3))


def _render_dotplot_page():
    """Render the two-sequence dot plot page."""
    st.subheader("Generate Dot Plot For Two Sequences")
    seq_file1 = st.file_uploader("Upload 1st FASTA File", type=["fasta", "fa"])
    seq_file2 = st.file_uploader("Upload 2nd FASTA File", type=["fasta", "fa"])
    # Both uploads are required; check each one explicitly.
    if seq_file1 is None or seq_file2 is None:
        return
    dna_record1 = _read_fasta_record(seq_file1)
    dna_record2 = _read_fasta_record(seq_file2)
    if dna_record1 is None or dna_record2 is None:
        return
    dna_seq1 = dna_record1.seq
    dna_seq2 = dna_record2.seq

    details = st.radio("Details", ("Description", "Sequence"))
    if details == "Description":
        st.write(dna_record1.description)
        st.write("=====================")
        st.write(dna_record2.description)
    elif details == "Sequence":
        st.write(dna_record1.seq)
        st.write("=====================")
        st.write(dna_record2.seq)

    cus_limit = st.number_input("Select Max number of Nucleotide", 10, 200, 50)
    if st.button("Dot Plot"):
        st.write("Comparing the first {} Nucleotide of the Two Sequences".format(cus_limit))
        # dotplotx draws on the current pyplot figure, so make a fresh one
        # current and hand that exact figure to Streamlit.
        fig = plt.figure()
        dotplotx(dna_seq1[0:cus_limit], dna_seq2[0:cus_limit])
        st.pyplot(fig)
        plt.close(fig)


def main():
    """A Simple Streamlit App """
    st.title("BioInformatics App")
    activity = ['Intro', 'DNA', 'DotPlot', "About"]
    choice = st.sidebar.selectbox("Select Activity", activity)
    if choice == 'Intro':
        st.subheader("Intro")
    elif choice == "DNA":
        # Must match the 'DNA' entry offered in the sidebar above.
        _render_dna_page()
    elif choice == "DotPlot":
        _render_dotplot_page()
    elif choice == "About":
        st.subheader("About")
# Start the app only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()
You can also check out the entire video tutorial below
Thanks for watching
Jesus Saves
By Jesse E. Agbe(JCharis)
I love your blog.. very nice colors & theme. Did you create this website yourself or did you hire someone to do it for you? Plz respond as I’m looking to create my own blog and would like to know where u got this from. thank you
Thanks for the kind words. We did it ourselves.