NaviDiv/app.py at main · mohammedazzouzi15/NaviDiv · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
"""Streamlit web application for molecular diversity analysis.

This module provides a user-friendly interface for molecular diversity analysis
using various scoring functions including frequency-based, similarity-based,
and cluster-based approaches.
"""

from pathlib import Path
from typing import Any

import plotly.io as pio
import streamlit as st
from rdkit import RDLogger

from navidiv.app_utils.action_func import do_tsne, run_all_scorers, run_scorer
from navidiv.app_utils.description import (
    create_scoring_info_section,
)
from navidiv.app_utils.file_name_registry import initiate_file_name_registry
from navidiv.app_utils.plot_generated_molecules import (
    get_data_from_file,
    plot_generated_molecules,
)
from navidiv.app_utils.plot_results import plot_results, plot_step_results
from navidiv.utils import get_smiles_column

# Set the RDKit logger threshold to ERROR (presumably to keep lower-severity
# RDKit messages out of the app output -- confirm against RDKit's RDLogger).
RDLogger.logger().setLevel(RDLogger.ERROR)

# Create a global registry instance (or inject as needed).
# create_results_section() queries it with a result folder name to obtain the
# label shown in the result selectors.
file_name_registry = initiate_file_name_registry()


def initialize_app() -> bool:
    """Configure the Streamlit page and render the app header.

    Applies the page configuration, selects the default Plotly template,
    then writes the title, the introductory text and a divider.

    Returns:
        Always ``True``.
    """
    page_settings = {
        "page_title": "NaviDiv - Molecular Diversity Analysis",
        "layout": "wide",
        "initial_sidebar_state": "auto",
        "page_icon": "🧬",
    }
    st.set_page_config(**page_settings)
    pio.templates.default = "plotly"

    # Header: title followed by a short description of the tool.
    intro_markdown = """
    **A comprehensive tool for analyzing molecular diversity in datasets.**

    Upload your CSV file containing SMILES strings to start exploring
    molecular diversity through various scoring methods, t-SNE visualization,
    and fragment analysis.
    """
    st.title("🧬 NaviDiv - Molecular Diversity Analysis")
    st.markdown(intro_markdown)
    st.divider()
    return True


def on_change_file_path() -> None:
    """Copy the path text-input value into ``st.session_state.file_path``.

    Intended as the ``on_change`` callback of the widget whose key is
    ``file_path_input``.
    """
    state = st.session_state
    state.file_path = state.file_path_input


def load_file_section() -> str:
    """Render the dataset-loading widgets and return the typed path.

    Shows a text input for the CSV path next to a "Load File" button. When
    the button is pressed the path is checked (non-empty, exists, ``.csv``
    suffix); on success it is stored in ``st.session_state.file_path``,
    otherwise an error or warning is displayed.

    Returns:
        The current content of the path text input, whether or not it
        passed the checks.
    """
    st.markdown("### 📁 Load Your Dataset")

    path_help = (
        "CSV should contain SMILES strings and optionally "
        "'step' and 'Score' columns for analysis"
    )

    with st.container():
        input_col, button_col = st.columns([3, 1])

        with input_col:
            val = st.text_input(
                "📄 Enter path to your CSV file containing SMILES data",
                key="file_path_input",
                on_change=on_change_file_path,
                placeholder="/media/mohammed/Work/Navi_diversity/tests/test_data/default/default_1_TSNE.csv",
                help=path_help,
            )

        with button_col:
            load_clicked = st.button(
                "📂 Load File",
                type="primary",
                help="Load and validate the CSV file",
            )
            if load_clicked and not val:
                st.error("❌ Please enter a valid file path.")
            elif load_clicked:
                try:
                    # Basic validation: the path must exist and end in .csv.
                    candidate = Path(val)
                    if not candidate.exists():
                        st.error(f"❌ File not found: {val}")
                    elif candidate.suffix.lower() != ".csv":
                        st.warning("⚠️ File should be a CSV (.csv)")
                    else:
                        st.session_state.file_path = val
                        st.success(f"✅ File loaded: {candidate.name}")
                except OSError as e:
                    st.error(f"❌ Error loading file: {e}")

    return val

def sidebar_analysis(file_path):
    """Invoke the analysis actions for ``file_path`` in a fixed order.

    Calls ``do_tsne``, ``run_all_scorers`` and ``run_scorer`` one after the
    other, each with ``file_path``. The original inline comment describes
    these as the analysis buttons shown in the sidebar.
    """
    # The order is kept as-is: it is the order in which the actions run.
    for action in (do_tsne, run_all_scorers, run_scorer):
        action(file_path)


def create_analysis_tools_section(file_path: str) -> None:
    """Render the chemical-space plots for the dataset at ``file_path``.

    Loads the plotting data with ``get_data_from_file`` and shows two tabs:
    one plotting every molecule, and one that marks the molecules listed in
    ``st.session_state.list_of_molecules_containing_fragment`` (only when
    that entry exists and is non-empty).

    Any exception raised while loading or plotting is reported in the UI
    instead of being propagated.

    Args:
        file_path: Path of the CSV file to visualise.
    """
    st.markdown("### 🔬Chemical space:")

    # Main visualization
    try:
        filtered_data, x_column_2, y_column_2, hue_column_2 = (
            get_data_from_file(file_path)
        )

        # Tabs for different views
        tab_all, tab_frag = st.tabs(
            [
                "🧬 All Molecules",
                "🎯 Fragment Analysis",
            ]
        )

        with tab_all:
            st.markdown(
                "**All Molecules View:** Comprehensive visualization of "
                "all molecules in your dataset."
            )
            plot_generated_molecules(
                filtered_data,
                symbol_column=None,
                x_column=x_column_2,
                y_column=y_column_2,
                hue_column=hue_column_2,
                key="molecules_all",
            )

        with tab_frag:
            st.markdown(
                "**Fragment Analysis:** Focused view on molecules containing "
                "specific structural fragments."
            )
            # Read the selection once, rather than once per row.
            fragment_molecules = getattr(
                st.session_state,
                "list_of_molecules_containing_fragment",
                None,
            )
            if fragment_molecules:
                smiles_column = get_smiles_column(filtered_data)
                # Vectorised membership test instead of a per-row lambda.
                # Assumes filtered_data is a pandas DataFrame and the
                # selection is list-like -- TODO confirm.
                filtered_data["Molecules containing fragment"] = (
                    filtered_data[smiles_column].isin(fragment_molecules)
                )
                plot_generated_molecules(
                    filtered_data,
                    symbol_column="Molecules containing fragment",
                    x_column=x_column_2,
                    y_column=y_column_2,
                    hue_column=hue_column_2,
                    key="molecules_frag",
                )
            else:
                st.info(
                    "🔍 No fragment selection available. "
                    "Run fragment analysis first."
                )

    # Deliberately broad: this is the UI boundary, so any failure is shown
    # to the user as a message instead of aborting the page render.
    except Exception as e:
        st.error(f"❌ Error processing data: {e}")
        st.info("💡 Please ensure your CSV contains valid SMILES strings.")


def _result_display_name(option: Any) -> Any:
    """Return the selector label for a result file option.

    ``Path`` options are labelled with the registry display name of their
    parent folder; any other value is returned unchanged.
    """
    if isinstance(option, Path):
        return file_name_registry.get_display_name(option.parent.name)
    return option


def _render_results_tab(
    *,
    description: str,
    pattern: str,
    label: str,
    help_text: str,
    empty_message: str,
    plot: Any,
    key: Any = None,
) -> None:
    """Render one results tab: description, file selector and plot.

    Does nothing when ``st.session_state`` has no ``output_path`` entry.

    Args:
        description: Markdown text shown at the top of the tab.
        pattern: Glob pattern, relative to the output folder, that selects
            the result CSV files.
        label: Label of the file selector.
        help_text: Tooltip of the file selector.
        empty_message: Info text shown when no file matches ``pattern``.
        plot: Callable invoked with the selected file path as a string.
        key: Optional Streamlit widget key for the selector.
    """
    if not hasattr(st.session_state, "output_path"):
        return

    st.markdown(description)

    output_path = Path(st.session_state.output_path)
    if not output_path.exists():
        output_path.mkdir(parents=True, exist_ok=True)

    # glob() yields matches in arbitrary, filesystem-dependent order; sort
    # them so the selector options keep a stable order across reruns.
    csv_files = sorted(
        match.relative_to(output_path) for match in output_path.glob(pattern)
    )
    if not csv_files:
        st.info(empty_message)
        return

    selected = st.selectbox(
        label,
        csv_files,
        key=key,
        format_func=_result_display_name,
        help=help_text,
    )
    if selected:
        plot(f"{output_path}/{selected}")


def create_results_section(col3: Any) -> None:
    """Create the results analysis section.

    Renders two tabs: per-fragment results (``*/group*.csv`` files) and
    per-step results (``*/step_*.csv`` files), both looked up under
    ``st.session_state.output_path``. The output folder is created when it
    does not exist yet.

    Args:
        col3: Forwarded to ``plot_results`` as its second argument
            (presumably the Streamlit column used for details -- confirm
            in ``plot_results``).
    """
    st.markdown("### 📊 Analysis Results")

    tab_per_fragment, tab_per_step = st.tabs(
        ["🧩 Per Fragment", "📈 Per Step"]
    )

    with tab_per_fragment:
        _render_results_tab(
            description=(
                "**Fragment Occurrence Analysis:** Shows how frequently "
                "different molecular fragments appear in your dataset."
            ),
            pattern="*/group*.csv",
            label="Select Fragment Analysis Results",
            help_text="Choose which fragment analysis results to display",
            empty_message=(
                "🔄 No fragment results available yet. "
                "Run 'All Scorers' to generate analysis."
            ),
            plot=lambda path: plot_results(path, col3),
        )

    with tab_per_step:
        _render_results_tab(
            description=(
                "**Evolution Analysis:** Displays the evolution of diversity "
                "metrics across generation steps."
            ),
            pattern="*/step_*.csv",
            label="Select Step Evolution Results",
            help_text="Choose which step-wise analysis results to display",
            empty_message=(
                "🔄 No step results available yet. "
                "Run 'All Scorers' to generate analysis."
            ),
            plot=plot_step_results,
            key="file_path_results",
        )


def main() -> None:
    """Build the page: header, scoring info, file loader, analysis, results.

    The analysis tools are only rendered once a path has been typed and
    ``st.session_state`` holds a ``file_path`` entry; until then a short
    getting-started note is shown instead. The results section is always
    rendered.
    """
    if not initialize_app():
        return

    # Information section, then the file loader.
    create_scoring_info_section()
    typed_path = load_file_section()
    st.divider()

    # Three-column layout: analysis, results, and a third column that is
    # handed to the results section.
    analysis_col, results_col, detail_col = st.columns([2, 2, 1])

    file_ready = bool(typed_path and hasattr(st.session_state, "file_path"))
    if not file_ready:
        with analysis_col:
            st.info("👆 **Getting Started:** Load your CSV file above.")
            st.markdown("""
            **Requirements:**
            - 📊 CSV file with SMILES strings
            - 📈 Optional: 'step' and 'Score' columns for evolution analysis
            - 🧪 Recommended: At least 100+ molecules for diversity analysis
            """)
    else:
        sidebar_analysis(st.session_state.file_path)
        with analysis_col:
            create_analysis_tools_section(st.session_state.file_path)

    with results_col:
        create_results_section(detail_col)


# Entry point: build the app only when this file is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()