diff --git a/q2_ms/plugin_setup.py b/q2_ms/plugin_setup.py
index cc07c21..f678f6a 100644
--- a/q2_ms/plugin_setup.py
+++ b/q2_ms/plugin_setup.py
@@ -7,7 +7,9 @@
 # ----------------------------------------------------------------------------
 import importlib
 
+from q2_types.metadata import ImmutableMetadata
 from q2_types.sample_data import SampleData
+from qiime2.core.type import Choices, Properties, Str, TypeMap
 from qiime2.plugin import Citations, Metadata, Plugin
 
 from q2_ms import __version__
@@ -38,6 +40,7 @@
     mzMLFormat,
 )
 from q2_ms.xcms.database import fetch_massbank
+from q2_ms.xcms.metadata import create_spectral_metadata
 from q2_ms.xcms.read_ms_experiment import read_ms_experiment
 
 citations = Citations.load("citations.bib", package="q2_ms")
@@ -67,6 +70,60 @@
     citations=[],
 )
 
+P_ms_level, I_xcms_experiment, _ = TypeMap(
+    {
+        (Str % Choices(["1"]), XCMSExperiment): ImmutableMetadata,
+        (Str % Choices(["1"]), XCMSExperiment % Properties("peaks")): ImmutableMetadata,
+        (
+            Str % Choices(["1"]),
+            XCMSExperiment % Properties("features"),
+        ): ImmutableMetadata,
+        (Str % Choices(["1"]), XCMSExperiment % Properties("MS2")): ImmutableMetadata,
+        (
+            Str % Choices(["1"]),
+            XCMSExperiment % Properties("MS2", "peaks"),
+        ): ImmutableMetadata,
+        (
+            Str % Choices(["1"]),
+            XCMSExperiment % Properties("MS2", "features"),
+        ): ImmutableMetadata,
+        (Str % Choices(["2"]), XCMSExperiment % Properties("MS2")): ImmutableMetadata,
+        (
+            Str % Choices(["2"]),
+            XCMSExperiment % Properties("MS2", "peaks"),
+        ): ImmutableMetadata,
+        (
+            Str % Choices(["2"]),
+            XCMSExperiment % Properties("MS2", "features"),
+        ): ImmutableMetadata,
+    }
+)
+
+
+plugin.methods.register_function(
+    function=create_spectral_metadata,
+    inputs={"xcms_experiment": I_xcms_experiment},
+    outputs=[("spectral_metadata", ImmutableMetadata)],
+    parameters={"ms_level": P_ms_level},
+    input_descriptions={"xcms_experiment": "XCMSExperiment."},
+    output_descriptions={
+        "spectral_metadata": "Spectral metadata of all scans of the chosen MS level."
+    },
+    parameter_descriptions={
+        "ms_level": "If the spectral metadata should be created for MS1 or MS2 scans."
+    },
+    name="Create spectral metadata",
+    description=(
+        "This action creates a spectral metadata table from a XCMSExperiment artifact. "
+        "This metadata can be used to plot total ion chromatograms or base peak "
+        "chromatograms and other line and box plots with q2-vizard.\n\nNOTE:\nThe data "
+        "gets filtered by MS level and only scans of the chosen level are retained. "
+        "Also the name of the column defining the sample id in the sample data will "
+        "get '_' added as a suffix."
+    ),
+    citations=[],
+)
+
 plugin.pipelines.register_function(
     function=read_ms_experiment,
     inputs={"spectra": SampleData[mzML]},
diff --git a/q2_ms/xcms/metadata.py b/q2_ms/xcms/metadata.py
new file mode 100644
index 0000000..09a46ee
--- /dev/null
+++ b/q2_ms/xcms/metadata.py
@@ -0,0 +1,108 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2025, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import os
+
+import pandas as pd
+import qiime2
+from qiime2.metadata.base import is_id_header
+
+from q2_ms.types import XCMSExperimentDirFmt
+
+
+def create_spectral_metadata(
+    xcms_experiment: XCMSExperimentDirFmt, ms_level: str = "1"
+) -> qiime2.Metadata:
+    """Create a per-scan metadata table from an XCMSExperiment directory.
+
+    The backend data is joined with the sample data through the links file
+    and then filtered to the scans of a single MS level.
+
+    Parameters
+    ----------
+    xcms_experiment : XCMSExperimentDirFmt
+        Directory containing ``ms_backend_data.txt``,
+        ``ms_experiment_sample_data.txt`` and
+        ``ms_experiment_sample_data_links_spectra.txt``.
+    ms_level : str
+        MS level of the scans to keep, passed as a string ("1" or "2").
+
+    Returns
+    -------
+    qiime2.Metadata
+        One row per retained scan; the index is named "id" and holds the
+        backend row index converted to str.
+    """
+    # Read the backend data file while skipping the first line
+    backend_df = pd.read_csv(
+        os.path.join(str(xcms_experiment), "ms_backend_data.txt"),
+        sep="\t",
+        skiprows=1,
+        index_col=0,
+    )
+
+    # Read the sample data file
+    sample_df = pd.read_csv(
+        os.path.join(str(xcms_experiment), "ms_experiment_sample_data.txt"), sep="\t"
+    )
+    sample_df.drop(columns=["spectraOrigin"], inplace=True)
+
+    # Read the links file that maps sample indices to spectra indices.
+    links_df = pd.read_csv(
+        os.path.join(
+            str(xcms_experiment), "ms_experiment_sample_data_links_spectra.txt"
+        ),
+        sep="\t",
+        header=None,
+        names=["sample_id", "spectra_index"],
+    )
+
+    # Merge the backend data with the links data using the spectra_index.
+    merged_df = backend_df.join(links_df.set_index("spectra_index"), how="left")
+
+    # Merge the merged_df with the sample_df using the sample_id.
+    merged_df = merged_df.join(sample_df, on="sample_id")
+
+    # Create new column that shows the difference between rtime and the adjusted rtime
+    if "rtime_adjusted" in merged_df.columns:
+        merged_df["rtime_adjusted-rtime"] = (
+            merged_df["rtime_adjusted"] - merged_df["rtime"]
+        )
+
+    # Set index to str
+    merged_df.index.name = "id"
+    merged_df.index = merged_df.index.astype(str)
+
+    # Set column centroided to str
+    if "centroided" in merged_df.columns:
+        merged_df["centroided"] = merged_df["centroided"].astype(str)
+
+    # Adds "_" to column name if it is a reserved metadata index name
+    merged_df.columns = [
+        col + "_" if is_id_header(col) else col for col in merged_df.columns
+    ]
+
+    # Filter data by MS level
+    merged_df = merged_df.loc[merged_df["msLevel"] == int(ms_level)]
+
+    if ms_level == "1":
+        # Drop columns that only contain information about MS2 scans
+        columns_to_drop = [
+            "precScanNum",
+            "precursorMz",
+            "precursorIntensity",
+            "precursorCharge",
+            "collisionEnergy",
+            "isolationWindowLowerMz",
+            "isolationWindowTargetMz",
+            "isolationWindowUpperMz",
+        ]
+        # DataFrame.drop returns a new frame, so the result must be rebound;
+        # errors="ignore" skips columns that the backend file does not have.
+        merged_df = merged_df.drop(columns=columns_to_drop, errors="ignore")
+
+    return qiime2.Metadata(merged_df)
diff --git a/q2_ms/xcms/tests/data/metadata/ms_backend_data.txt b/q2_ms/xcms/tests/data/metadata/ms_backend_data.txt
new file mode 100644
index 0000000..b1b19e8
--- /dev/null
+++ b/q2_ms/xcms/tests/data/metadata/ms_backend_data.txt
@@ -0,0 +1,6 @@
+# MsBackendMzR
+"msLevel" "rtime" "acquisitionNum" "dataOrigin" "centroided" "polarity" "precScanNum" "precursorMz" "precursorIntensity" "precursorCharge" "collisionEnergy" "isolationWindowLowerMz" "isolationWindowTargetMz" "isolationWindowUpperMz" "peaksCount" "totIonCurrent" "basePeakMZ" "basePeakIntensity" "ionisationEnergy" "lowMZ" "highMZ" "injectionTime" "filterString" "spectrumId" "scanWindowLowerLimit" "scanWindowUpperLimit" "rtime_adjusted" "dataStorage" "scanIndex"
+"1" 1 0.106081408 1 "path/sample_1.mzML" TRUE 0 NA NA NA NA NA NA NA NA 909 19614582 197.807357788086 2139308 0 61.9880981445313 755.029235839844 50 "FTMS - c ESI Full ms [60.0000-900.0000]" "controllerType=0 controllerNumber=1 scan=1" 60 900 0.106081408 "path/sample_1.mzML" 1
+"2" 1 0.536452816 2 "path/sample_2.mzML" TRUE 0 NA NA NA NA NA NA NA NA 968 20253388 197.807403564453 2154304.5 0 61.9871025085449 665.043334960938 50 "FTMS - c ESI Full ms [60.0000-900.0000]" "controllerType=0 controllerNumber=1 scan=2" 60 900 0.536452816 "path/sample_2.mzML" 2
+"3" 2 0.613772143999998 3 "path/sample_3.mzML" TRUE 0 1 203.081527709961 811281.937988281 0 30 202.481527709961 203.081527709961 203.681527709961 8 73490.75 155.461517333984 9954.2333984375 0 66.4943084716797 155.461517333984 22 "FTMS - c ESI d Full ms2 203.0815@hcd30.00 [52.0000-214.0000]" "controllerType=0 controllerNumber=1 scan=3" 52 214 0.613772143999998 "path/sample_3.mzML" 3
+"4" 2 0.719534496 4 "path/sample_4.mzML" TRUE 0 1 219.84489440918 152150.111816406 1 30 219.24489440918 219.84489440918 220.44489440918 4 74733 219.844863891602 46147.5546875 0 90.0349273681641 219.844863891602 22 "FTMS - c ESI d Full ms2 219.8449@hcd30.00 [53.0000-230.0000]" "controllerType=0 controllerNumber=1 scan=4" 53 230 0.719534496 "path/sample_4.mzML" 4
diff --git a/q2_ms/xcms/tests/data/metadata/ms_experiment_sample_data.txt b/q2_ms/xcms/tests/data/metadata/ms_experiment_sample_data.txt
new file mode 100644
index 0000000..8d8c342
--- /dev/null
+++ b/q2_ms/xcms/tests/data/metadata/ms_experiment_sample_data.txt
@@ -0,0 +1,5 @@
+"sample_name" "sample_group" "sample_type" "spectraOrigin"
+"1" "sample_1" "KO" "QC" "path/sample_1.mzML"
+"2" "sample_2" "KO" "study" "path/sample_2.mzML"
+"3" "sample_3" "WT" "QC" "path/sample_3.mzML"
+"4" "sample_4" "WT" "study" "path/sample_4.mzML"
diff --git a/q2_ms/xcms/tests/data/metadata/ms_experiment_sample_data_links_spectra.txt b/q2_ms/xcms/tests/data/metadata/ms_experiment_sample_data_links_spectra.txt
new file mode 100644
index 0000000..a7c79bd
--- /dev/null
+++ b/q2_ms/xcms/tests/data/metadata/ms_experiment_sample_data_links_spectra.txt
@@ -0,0 +1,4 @@
+1 1
+2 2
+3 3
+4 4
diff --git a/q2_ms/xcms/tests/data/metadata_expected/spectral_metadata_ms1.tsv b/q2_ms/xcms/tests/data/metadata_expected/spectral_metadata_ms1.tsv
new file mode 100644
index 0000000..5ac18d7
--- /dev/null
+++ b/q2_ms/xcms/tests/data/metadata_expected/spectral_metadata_ms1.tsv
@@ -0,0 +1,3 @@
+id msLevel rtime acquisitionNum dataOrigin centroided polarity peaksCount totIonCurrent basePeakMZ basePeakIntensity ionisationEnergy lowMZ highMZ injectionTime filterString spectrumId scanWindowLowerLimit scanWindowUpperLimit rtime_adjusted dataStorage scanIndex sample_id sample_name_ sample_group sample_type rtime_adjusted-rtime
+1 1 0.106081408 1 path/sample_1.mzML True 0 909 19614582.0 197.807357788086 2139308.0 0 61.9880981445313 755.029235839844 50 FTMS - c ESI Full ms [60.0000-900.0000] controllerType=0 controllerNumber=1 scan=1 60 900 0.106081408 path/sample_1.mzML 1 1 sample_1 KO QC 0.0
+2 1 0.536452816 2 path/sample_2.mzML True 0 968 20253388.0 197.807403564453 2154304.5 0 61.9871025085449 665.043334960938 50 FTMS - c ESI Full ms [60.0000-900.0000] controllerType=0 controllerNumber=1 scan=2 60 900 0.536452816 path/sample_2.mzML 2 2 sample_2 KO study 0.0
diff --git a/q2_ms/xcms/tests/data/metadata_expected/spectral_metadata_ms1_no_centroided_rt_adjusted.tsv b/q2_ms/xcms/tests/data/metadata_expected/spectral_metadata_ms1_no_centroided_rt_adjusted.tsv
new file mode 100644
index 0000000..0e9ff18
--- /dev/null
+++ b/q2_ms/xcms/tests/data/metadata_expected/spectral_metadata_ms1_no_centroided_rt_adjusted.tsv
@@ -0,0 +1,3 @@
+id msLevel rtime acquisitionNum dataOrigin polarity peaksCount totIonCurrent basePeakMZ basePeakIntensity ionisationEnergy lowMZ highMZ injectionTime filterString spectrumId scanWindowLowerLimit scanWindowUpperLimit dataStorage scanIndex sample_id sample_name_ sample_group sample_type
+1 1.0 0.106081408 1.0 path/sample_1.mzML 0.0 909.0 19614582.0 197.807357788086 2139308.0 0.0 61.9880981445313 755.029235839844 50.0 FTMS - c ESI Full ms [60.0000-900.0000] controllerType=0 controllerNumber=1 scan=1 60.0 900.0 path/sample_1.mzML 1.0 1.0 sample_1 KO QC
+2 1.0 0.536452816 2.0 path/sample_2.mzML 0.0 968.0 20253388.0 197.807403564453 2154304.5 0.0 61.9871025085449 665.043334960938 50.0 FTMS - c ESI Full ms [60.0000-900.0000] controllerType=0 controllerNumber=1 scan=2 60.0 900.0 path/sample_2.mzML 2.0 2.0 sample_2 KO study
diff --git a/q2_ms/xcms/tests/data/metadata_expected/spectral_metadata_ms2.tsv b/q2_ms/xcms/tests/data/metadata_expected/spectral_metadata_ms2.tsv
new file mode 100644
index 0000000..6f8c75f
--- /dev/null
+++ b/q2_ms/xcms/tests/data/metadata_expected/spectral_metadata_ms2.tsv
@@ -0,0 +1,3 @@
+id msLevel rtime acquisitionNum dataOrigin centroided polarity precScanNum precursorMz precursorIntensity precursorCharge collisionEnergy isolationWindowLowerMz isolationWindowTargetMz isolationWindowUpperMz peaksCount totIonCurrent basePeakMZ basePeakIntensity ionisationEnergy lowMZ highMZ injectionTime filterString spectrumId scanWindowLowerLimit scanWindowUpperLimit rtime_adjusted dataStorage scanIndex sample_id sample_name_ sample_group sample_type
rtime_adjusted-rtime +3 2.0 0.613772143999998 3.0 path/sample_3.mzML True 0.0 1.0 203.081527709961 811281.937988281 0.0 30.0 202.481527709961 203.081527709961 203.681527709961 8.0 73490.75 155.461517333984 9954.2333984375 0.0 66.4943084716797 155.461517333984 22.0 FTMS - c ESI d Full ms2 203.0815@hcd30.00 [52.0000-214.0000] controllerType=0 controllerNumber=1 scan=3 52.0 214.0 0.613772143999998 path/sample_3.mzML 3.0 3.0 sample_3 WT QC 0.0 +4 2.0 0.719534496 4.0 path/sample_4.mzML True 0.0 1.0 219.84489440918 152150.111816406 1.0 30.0 219.24489440918 219.84489440918 220.44489440918 4.0 74733.0 219.844863891602 46147.5546875 0.0 90.0349273681641 219.844863891602 22.0 FTMS - c ESI d Full ms2 219.8449@hcd30.00 [53.0000-230.0000] controllerType=0 controllerNumber=1 scan=4 53.0 230.0 0.719534496 path/sample_4.mzML 4.0 4.0 sample_4 WT study 0.0 diff --git a/q2_ms/xcms/tests/data/metadata_no_centroided_rt/ms_backend_data.txt b/q2_ms/xcms/tests/data/metadata_no_centroided_rt/ms_backend_data.txt new file mode 100644 index 0000000..75ef575 --- /dev/null +++ b/q2_ms/xcms/tests/data/metadata_no_centroided_rt/ms_backend_data.txt @@ -0,0 +1,6 @@ +# MsBackendMzR +"msLevel" "rtime" "acquisitionNum" "dataOrigin" "polarity" "precScanNum" "precursorMz" "precursorIntensity" "precursorCharge" "collisionEnergy" "isolationWindowLowerMz" "isolationWindowTargetMz" "isolationWindowUpperMz" "peaksCount" "totIonCurrent" "basePeakMZ" "basePeakIntensity" "ionisationEnergy" "lowMZ" "highMZ" "injectionTime" "filterString" "spectrumId" "scanWindowLowerLimit" "scanWindowUpperLimit" "dataStorage" "scanIndex" +"1" 1 0.106081408 1 "path/sample_1.mzML" 0 NA NA NA NA NA NA NA NA 909 19614582 197.807357788086 2139308 0 61.9880981445313 755.029235839844 50 "FTMS - c ESI Full ms [60.0000-900.0000]" "controllerType=0 controllerNumber=1 scan=1" 60 900 "path/sample_1.mzML" 1 +"2" 1 0.536452816 2 "path/sample_2.mzML" 0 NA NA NA NA NA NA NA NA 968 20253388 197.807403564453 2154304.5 0 61.9871025085449 
665.043334960938 50 "FTMS - c ESI Full ms [60.0000-900.0000]" "controllerType=0 controllerNumber=1 scan=2" 60 900 "path/sample_2.mzML" 2 +"3" 2 0.613772143999998 3 "path/sample_3.mzML" 0 1 203.081527709961 811281.937988281 0 30 202.481527709961 203.081527709961 203.681527709961 8 73490.75 155.461517333984 9954.2333984375 0 66.4943084716797 155.461517333984 22 "FTMS - c ESI d Full ms2 203.0815@hcd30.00 [52.0000-214.0000]" "controllerType=0 controllerNumber=1 scan=3" 52 214 "path/sample_3.mzML" 3 +"4" 2 0.719534496 4 "path/sample_4.mzML" 0 1 219.84489440918 152150.111816406 1 30 219.24489440918 219.84489440918 220.44489440918 4 74733 219.844863891602 46147.5546875 0 90.0349273681641 219.844863891602 22 "FTMS - c ESI d Full ms2 219.8449@hcd30.00 [53.0000-230.0000]" "controllerType=0 controllerNumber=1 scan=4" 53 230 "path/sample_4.mzML" 4 diff --git a/q2_ms/xcms/tests/data/metadata_no_centroided_rt/ms_experiment_sample_data.txt b/q2_ms/xcms/tests/data/metadata_no_centroided_rt/ms_experiment_sample_data.txt new file mode 100644 index 0000000..8d8c342 --- /dev/null +++ b/q2_ms/xcms/tests/data/metadata_no_centroided_rt/ms_experiment_sample_data.txt @@ -0,0 +1,5 @@ +"sample_name" "sample_group" "sample_type" "spectraOrigin" +"1" "sample_1" "KO" "QC" "path/sample_1.mzML" +"2" "sample_2" "KO" "study" "path/sample_2.mzML" +"3" "sample_3" "WT" "QC" "path/sample_3.mzML" +"4" "sample_4" "WT" "study" "path/sample_4.mzML" diff --git a/q2_ms/xcms/tests/data/metadata_no_centroided_rt/ms_experiment_sample_data_links_spectra.txt b/q2_ms/xcms/tests/data/metadata_no_centroided_rt/ms_experiment_sample_data_links_spectra.txt new file mode 100644 index 0000000..a7c79bd --- /dev/null +++ b/q2_ms/xcms/tests/data/metadata_no_centroided_rt/ms_experiment_sample_data_links_spectra.txt @@ -0,0 +1,4 @@ +1 1 +2 2 +3 3 +4 4 diff --git a/q2_ms/xcms/tests/test_metadata.py b/q2_ms/xcms/tests/test_metadata.py new file mode 100644 index 0000000..e55cf09 --- /dev/null +++ 
b/q2_ms/xcms/tests/test_metadata.py
@@ -0,0 +1,68 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2025, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import os
+
+import pandas as pd
+from pandas.testing import assert_frame_equal
+from qiime2.plugin.testing import TestPluginBase
+
+from q2_ms.types import XCMSExperimentDirFmt
+from q2_ms.xcms.metadata import create_spectral_metadata
+
+
+class TestMetadata(TestPluginBase):
+    """Tests for create_spectral_metadata using the TSV fixtures."""
+
+    package = "q2_ms.xcms.tests"
+
+    def test_create_spectral_metadata(self):
+        xcms_experiment = XCMSExperimentDirFmt(self.get_data_path("metadata"), "r")
+        obs = create_spectral_metadata(xcms_experiment)
+        self._test_create_spectral_metadata_helper(obs, "spectral_metadata_ms1.tsv")
+
+    def test_create_spectral_metadata_no_centroided_rt_adjusted(self):
+        xcms_experiment = XCMSExperimentDirFmt(
+            self.get_data_path("metadata_no_centroided_rt"), "r"
+        )
+        obs = create_spectral_metadata(xcms_experiment)
+        self._test_create_spectral_metadata_helper(
+            obs, "spectral_metadata_ms1_no_centroided_rt_adjusted.tsv"
+        )
+
+    def test_create_spectral_metadata_ms2(self):
+        xcms_experiment = XCMSExperimentDirFmt(self.get_data_path("metadata"), "r")
+        obs = create_spectral_metadata(xcms_experiment, "2")
+        self._test_create_spectral_metadata_helper(obs, "spectral_metadata_ms2.tsv")
+
+    def _test_create_spectral_metadata_helper(self, obs, exp_metadata):
+        """Assert that obs equals the expected TSV named exp_metadata."""
+        exp = pd.read_csv(
+            self.get_data_path(os.path.join("metadata_expected", exp_metadata)),
+            sep="\t",
+            index_col=0,
+        )
+        exp.index = exp.index.astype(str)
+        # These fixture columns are cast to float64 before comparing; presumably
+        # obs.to_dataframe() returns them as floats (TODO confirm).
+        columns_to_convert = [
+            "msLevel",
+            "acquisitionNum",
+            "polarity",
+            "peaksCount",
+            "ionisationEnergy",
+            "injectionTime",
+            "scanWindowLowerLimit",
+            "scanWindowUpperLimit",
+            "scanIndex",
+            "sample_id",
+        ]
+        exp[columns_to_convert] = exp[columns_to_convert].astype("float64")
+        if "centroided" in exp.columns:
+            exp["centroided"] = exp["centroided"].astype("str")
+
+        assert_frame_equal(obs.to_dataframe(), exp)