Skip to content

Commit ab91e9a

Browse files
committed
add FASTA analyzer page with automatic sequence detection
1 parent 23f2f63 commit ab91e9a

File tree

1 file changed

+243
-0
lines changed

1 file changed

+243
-0
lines changed

content/fasta_analyzer.py

Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
"""
2+
FASTA Analyzer Page
3+
4+
This module provides functionality for analyzing FASTA files to compute
5+
sequence length histograms and residue frequency statistics. Supports
6+
both protein and nucleotide (DNA/RNA) sequences.
7+
"""
8+
9+
import streamlit as st
10+
import pandas as pd
11+
import plotly.express as px
12+
import plotly.graph_objects as go
13+
import sys
14+
from pathlib import Path
15+
16+
# Add utils to path
17+
sys.path.append(str(Path(__file__).parent.parent))
18+
19+
from utils.fasta_analyzer import analyze_fasta
20+
21+
# Widest bin, in residues, that the sequence-length histogram should use.
_MAX_BIN_WIDTH = 100


def _histogram_bin_count(lengths):
    """Return how many bins keep each histogram bin at most _MAX_BIN_WIDTH wide.

    Args:
        lengths: pandas Series of sequence lengths (at least one value).

    Returns:
        A positive int; 1 when every sequence has the same length.
    """
    span = int(lengths.max() - lengths.min())
    # Ceiling division: rounding down would allow bins wider than the limit
    # (e.g. a span of 150 would collapse into a single 150-wide bin).
    return max(1, (span + _MAX_BIN_WIDTH - 1) // _MAX_BIN_WIDTH)


def _render_summary(results):
    """Show the headline metrics (counts and length statistics) in two rows."""
    stats = results["length_stats"]
    st.subheader("Summary Statistics")

    metric_rows = (
        (
            ("Total Sequences", results["total_sequences"]),
            ("Total Residues", f"{stats['total_residues']:,}"),
            ("Avg Length", f"{stats['mean']:.1f}"),
        ),
        (
            ("Min Length", stats["min"]),
            ("Max Length", stats["max"]),
            ("Median Length", stats["median"]),
        ),
    )
    for row in metric_rows:
        for column, (label, value) in zip(st.columns(3), row):
            column.metric(label, value)


def _render_length_distribution(length_stats):
    """Plot the sequence-length histogram, or a short note when there is nothing to plot."""
    st.subheader("Sequence Length Distribution")

    length_data = pd.DataFrame({
        "Header": length_stats["headers"],
        "Length": length_stats["lengths"],
    })

    if length_data.empty:
        # Guard the .iloc[0] access below against an empty result set.
        st.info("No sequences available to plot.")
        return

    if len(length_data) == 1:
        st.info(f"Single sequence with length: {length_data['Length'].iloc[0]}")
        return

    # Plotly treats nbins as an upper bound and may pick a rounder bin size.
    fig_hist = px.histogram(
        length_data,
        x="Length",
        nbins=_histogram_bin_count(length_data["Length"]),
        title="Distribution of Sequence Lengths",
        labels={"Length": "Sequence Length (residues)", "count": "Count"},
    )
    fig_hist.update_layout(
        showlegend=False,
        xaxis_title="Sequence Length (residues)",
        yaxis_title="Number of Sequences",
    )
    st.plotly_chart(fig_hist, use_container_width=True)


def _build_residue_table(freq_data):
    """Build the residue table (Residue, Count, Percentage) sorted by descending count."""
    counts = freq_data["counts"]
    percentages = freq_data["percentages"]
    residues = list(counts)
    residue_df = pd.DataFrame({
        "Residue": residues,
        "Count": [counts[residue] for residue in residues],
        # Look percentages up by residue so the two mappings cannot be
        # paired incorrectly if their iteration orders ever differ.
        "Percentage": [f"{percentages[residue]:.2f}%" for residue in residues],
    })
    return residue_df.sort_values("Count", ascending=False)


def _render_residue_frequencies(freq_data):
    """Draw the residue bar chart, table and pie chart; return the residue table."""
    st.subheader("Residue Frequency Analysis")

    residue_df = _build_residue_table(freq_data)

    fig_bar = px.bar(
        residue_df,
        x="Residue",
        y="Count",
        title=f"Residue Frequencies ({freq_data['seq_type'].upper()})",
        labels={"Residue": "Residue", "Count": "Count"},
    )
    fig_bar.update_layout(xaxis_tickangle=0)
    st.plotly_chart(fig_bar, use_container_width=True)

    col_table, col_chart = st.columns([1, 1])

    with col_table:
        st.markdown("**Residue Counts**")
        st.dataframe(
            residue_df,
            use_container_width=True,
            hide_index=True,
        )

    with col_chart:
        # The table is already sorted by count, so head(10) is the top ten.
        fig_pie = px.pie(
            residue_df.head(10),
            values="Count",
            names="Residue",
            title="Top 10 Residues",
        )
        st.plotly_chart(fig_pie, use_container_width=True)

    return residue_df


def _build_summary_table(results):
    """Return the Metric/Value table that goes into the downloadable report."""
    stats = results["length_stats"]
    return pd.DataFrame({
        "Metric": [
            "Total Sequences",
            "Total Residues",
            "Average Length",
            "Min Length",
            "Max Length",
            "Median Length",
        ],
        "Value": [
            results["total_sequences"],
            stats["total_residues"],
            f"{stats['mean']:.2f}",
            stats["min"],
            stats["max"],
            stats["median"],
        ],
    })


def _combined_report(summary_df, residue_df, sep):
    """Join the summary and residue tables into one delimited text report."""
    return "\n".join([
        "# FASTA Analysis Summary",
        summary_df.to_csv(sep=sep, index=False),
        "\n# Residue Frequencies",
        residue_df.to_csv(sep=sep, index=False),
    ])


def _render_downloads(results, residue_df):
    """Offer the combined report as TSV and CSV downloads."""
    st.subheader("Download Results")

    col_dl1, col_dl2 = st.columns(2)
    summary_df = _build_summary_table(results)

    with col_dl1:
        st.download_button(
            label="Download as TSV",
            data=_combined_report(summary_df, residue_df, "\t"),
            file_name="fasta_analysis_results.tsv",
            mime="text/tab-separated-values",
        )

    with col_dl2:
        st.download_button(
            label="Download as CSV",
            data=_combined_report(summary_df, residue_df, ","),
            file_name="fasta_analysis_results.csv",
            mime="text/csv",
        )


def main() -> None:
    """Render the FASTA Analyzer page.

    Shows the introduction and upload widgets. Once "Analyze Sequences" is
    pressed, the uploaded file is decoded, passed to ``analyze_fasta`` in
    "auto" mode, and the result is rendered as summary metrics, a length
    histogram, residue-frequency charts and TSV/CSV downloads.

    The analyzer result is read as a mapping with the keys ``success``,
    ``error``, ``total_sequences``, ``length_stats`` (``total_residues``,
    ``mean``, ``min``, ``max``, ``median``, ``headers``, ``lengths``) and
    ``residue_frequencies`` (``counts``, ``percentages``, ``seq_type``).
    Every failure is reported with ``st.error`` and ends the run early;
    nothing is raised to the caller.
    """
    st.title("FASTA Analyzer")

    st.markdown("""
    **Analyze FASTA sequence files** to compute statistics about sequence lengths and residue composition.

    This tool helps you:
    - **Understand sequence datasets** by visualizing length distributions
    - **Analyze residue composition** with frequency statistics for amino acids or nucleotides
    - **Calculate GC content** for nucleotide sequences
    """)

    with st.expander("How FASTA Analysis Works"):
        st.markdown("""
        **Residue Frequencies:**
        - For proteins: counts of the 20 standard amino acids (ACDEFGHIKLMNPQRSTVWY)
        - For DNA: counts of A, C, G, T (plus N for ambiguous)
        - For RNA: counts of A, C, G, U (plus N for ambiguous)

        **Length Statistics:**
        - Histogram showing distribution of sequence lengths
        - Summary statistics: min, max, mean, median lengths
        """)

    # Input section
    st.subheader("Input Parameters")

    # File uploader (outside form since file uploaders don't work well inside forms)
    uploaded_file = st.file_uploader(
        "Upload FASTA file",
        type=["fasta", "fa", "faa", "fna"],
        help="Upload a FASTA file (.fasta, .fa, .faa, .fna)",
    )

    submit = st.button("Analyze Sequences", type="primary")
    if not submit:
        return

    if uploaded_file is None:
        st.error("Please upload a FASTA file to analyze.")
        return

    try:
        # getvalue() returns the whole buffer regardless of the current read
        # position, and "utf-8-sig" strips a leading byte-order mark that
        # would otherwise be glued to the first ">" header.
        fasta_input = uploaded_file.getvalue().decode("utf-8-sig")
    except UnicodeDecodeError:
        st.error("Could not read file. Please ensure it is a valid text file.")
        return

    if not fasta_input.strip():
        st.error("Please provide FASTA sequences to analyze.")
        return

    with st.spinner("Analyzing sequences..."):
        try:
            results = analyze_fasta(fasta_input, "auto")
        except Exception as e:
            # Page-level boundary: the analyzer's exception types are not
            # visible here, so any failure is shown to the user.
            st.error(f"Error parsing FASTA: {e}")
            return

    if not results["success"]:
        st.error(results["error"])
        return

    st.success(f"Successfully analyzed {results['total_sequences']} sequence(s)")

    _render_summary(results)
    _render_length_distribution(results["length_stats"])
    residue_df = _render_residue_frequencies(results["residue_frequencies"])
    _render_downloads(results, residue_df)
241+
242+
243+
# Unconditional module-level call: the page is rendered whenever this file is
# executed, and also whenever it is imported.
main()

0 commit comments

Comments
 (0)