diff --git a/pysyndna/__init__.py b/pysyndna/__init__.py index 4f20674..d250891 100644 --- a/pysyndna/__init__.py +++ b/pysyndna/__init__.py @@ -3,6 +3,7 @@ FIT_SYNDNA_MODELS_LOG_KEY from pysyndna.src.calc_cell_counts import calc_ogu_cell_counts_biom, \ calc_ogu_cell_counts_per_g_of_sample_for_qiita, \ + calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input, \ calc_ogu_cell_counts_per_cm2_of_sample_for_qiita, \ calc_ogu_cell_counts_per_ul_of_sample_for_qiita, \ SAMPLE_IN_ALIQUOT_MASS_G_KEY, SAMPLE_SURFACE_AREA_CM2_KEY, \ @@ -21,6 +22,7 @@ 'fit_linear_regression_models_for_qiita', 'calc_ogu_cell_counts_biom', 'calc_ogu_cell_counts_per_g_of_sample_for_qiita', + 'calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input', 'calc_ogu_cell_counts_per_cm2_of_sample_for_qiita', 'calc_ogu_cell_counts_per_ul_of_sample_for_qiita', 'read_ogu_orf_coords_to_df', diff --git a/pysyndna/src/calc_cell_counts.py b/pysyndna/src/calc_cell_counts.py index fb8f5fe..335f57b 100644 --- a/pysyndna/src/calc_cell_counts.py +++ b/pysyndna/src/calc_cell_counts.py @@ -1,6 +1,7 @@ import biom import numpy as np import pandas as pd +from warnings import deprecated import yaml from typing import Optional, Union, Dict, List from pysyndna.src.util import calc_copies_genomic_element_per_g_series, \ @@ -218,7 +219,10 @@ def _validate_ogu_ids_in_inputs( # lengths because we don't really care about any of them that aren't *also* # in the reads data, and we've already checked those for consistency. 
-def _calc_ogu_cell_counts_per_x_of_sample_for_qiita( + +@deprecated("This function has been deprecated; " + "please use _calc_ogu_cell_counts_per_x_of_sample_for_qiita instead.") +def _calc_ogu_cell_counts_per_x_of_sample_for_qiita_split_input( sample_info_df: pd.DataFrame, prep_info_df: pd.DataFrame, linregress_by_sample_id_fp: str, @@ -261,6 +265,12 @@ def _calc_ogu_cell_counts_per_x_of_sample_for_qiita( String containing the filepath to a tab-separated, two-column, no-header file in which the first column is the OGU id and the second is the OGU length in basepairs + output_cell_counts_metric: str + String indicating the metric to use for the output cell counts. Must + be one of OGU_CELLS_PER_G_OF_SAMPLE_KEY, OGU_CELLS_PER_UL_OF_SAMPLE_KEY, + or OGU_CELLS_PER_CM2_OF_SAMPLE_KEY. This determines the units of the + output cell counts and also which column in the sample info is used for + the per-sample calculation. min_coverage : float Minimum allowable coverage of an OGU in a sample needed to include that OGU/sample in the output, expressed in the same units @@ -295,6 +305,93 @@ def _calc_ogu_cell_counts_per_x_of_sample_for_qiita( _ = validate_id_consistency_between_datasets( sample_info_df, prep_info_df, "sample info", "prep info", True) + # merge the sample info and prep info dataframes + absolute_quant_params_per_sample_df = \ + sample_info_df.merge(prep_info_df, on=SAMPLE_ID_KEY, how='left') + + out_txt_by_out_type = _calc_ogu_cell_counts_per_x_of_sample_for_qiita( + absolute_quant_params_per_sample_df, linregress_by_sample_id_fp, + ogu_counts_per_sample_biom, ogu_coverage_df, ogu_lengths_fp, output_cell_counts_metric, + min_coverage, min_rsquared, syndna_mass_fraction_of_sample + ) + + return out_txt_by_out_type + + +def _calc_ogu_cell_counts_per_x_of_sample_for_qiita( + absolute_quant_params_per_sample_df: pd.DataFrame, + linregress_by_sample_id_fp: str, + ogu_counts_per_sample_biom: biom.Table, + ogu_coverage_df: pd.DataFrame, + ogu_lengths_fp: str, + 
output_cell_counts_metric: str, + min_coverage: float, + min_rsquared: float = DEFAULT_MIN_RSQUARED, + syndna_mass_fraction_of_sample: float = + DEFAULT_SYNDNA_MASS_FRACTION_OF_SAMPLE) \ + -> Dict[str, Union[str, biom.Table]]: + + """Gets # of cells of each OGU per g/ul/cm2 of sample from Qiita. + + Parameters + ---------- + absolute_quant_params_per_sample_df: pd.DataFrame + A Dataframe of at least SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, and INPUT_SYNDNA_POOL_MASS_NG_KEY for each + sample in the prep. It should also have at least one of SAMPLE_VOLUME_UL_KEY, + SAMPLE_SURFACE_AREA_CM2_KEY, and/or SAMPLE_IN_ALIQUOT_MASS_G_KEY. + linregress_by_sample_id_fp: str + String containing the filepath to the yaml file holding the + dictionary keyed by sample id, containing for each sample a dictionary + representation of the sample's LinregressResult. + ogu_counts_per_sample_biom: biom.Table + Biom table holding the read counts aligned to each OGU in each sample. + ogu_coverage_df : pd.DataFrame + A DataFrame containing a column for OGU_ID_KEY and either a column for + OGU_PERCENT_COVERAGE_KEY (indicating the coverage is the same for all + samples) or a column for each sample id, which holds the coverage of + that OGU in that sample, expressed as either a fraction or a percent. + NOTE THAT IT IS UP TO THE USER TO ENSURE THAT THEY KNOW WHICH TYPE OF + VALUE (fraction or percent) IS BEING USED AND THAT THEY PROVIDE THE + APPROPRIATE min_coverage PARAMETER (e.g., 0.01 or 1 to drop <1% coverage). + ogu_lengths_fp : str + String containing the filepath to a tab-separated, two-column, + no-header file in which the first column is the OGU id and the + second is the OGU length in basepairs + output_cell_counts_metric: str + String indicating the metric to use for the output cell counts. Must + be one of OGU_CELLS_PER_G_OF_SAMPLE_KEY, OGU_CELLS_PER_UL_OF_SAMPLE_KEY, + or OGU_CELLS_PER_CM2_OF_SAMPLE_KEY. 
This determines the units of the + output cell counts and also which column in the input df is used for + the per-sample calculation. + min_coverage : float + Minimum allowable coverage of an OGU in a sample needed to include + that OGU/sample in the output, expressed in the same units + (fraction or percent) as used in ogu_coverage_df. + min_rsquared: float + Minimum allowable R^2 value for the linear regression model for a + sample; any sample with an R^2 value less than this will be excluded + from the output. + syndna_mass_fraction_of_sample: float + Fraction of the mass of the sample that is added as syndna (usually + 0.05, which is to say 5%). + + Returns + ------- + output_by_out_type : dict of str or biom.Table + Dictionary of outputs keyed by their type. Currently, the following keys + are defined: + CELL_COUNT_RESULT_KEY: biom.Table holding the calculated number of + cells per g, ul, or cm2 of sample for each OGU in each sample. + CELL_COUNT_LOG_KEY: log of messages from the cell count calc process. + """ + + required_prep_cols = list( + {INPUT_SYNDNA_POOL_MASS_NG_KEY} | set(REQUIRED_DNA_PREP_INFO_KEYS)) + validate_required_columns_exist( + absolute_quant_params_per_sample_df, required_prep_cols, + "absolute quant params per sample is missing required column(s)") + # cast in case the input comes in as string or something syndna_mass_fraction_of_sample = float(syndna_mass_fraction_of_sample) @@ -306,16 +403,12 @@ def _calc_ogu_cell_counts_per_x_of_sample_for_qiita( # the sample (added into the library prep in addition to the sample mass). # Therefore, if the syndna fraction is 0.05 or 5%, the mass of the sample # gDNA put into sequencing is 1/0.05 = 20x the mass of syndna pool added. 
- prep_info_df = cast_cols( - prep_info_df, [INPUT_SYNDNA_POOL_MASS_NG_KEY], True) - prep_info_df[SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY] = \ - prep_info_df[INPUT_SYNDNA_POOL_MASS_NG_KEY] * \ + absolute_quant_params_per_sample_df = cast_cols( + absolute_quant_params_per_sample_df, [INPUT_SYNDNA_POOL_MASS_NG_KEY], True) + absolute_quant_params_per_sample_df[SEQUENCED_SAMPLE_GDNA_MASS_NG_KEY] = \ + absolute_quant_params_per_sample_df[INPUT_SYNDNA_POOL_MASS_NG_KEY] * \ (1 / syndna_mass_fraction_of_sample) - # merge the sample info and prep info dataframes - absolute_quant_params_per_sample_df = \ - sample_info_df.merge(prep_info_df, on=SAMPLE_ID_KEY, how='left') - # read in the linregress_by_sample_id yaml file with open(linregress_by_sample_id_fp) as f: linregress_by_sample_id = yaml.load(f, Loader=yaml.FullLoader) @@ -961,8 +1054,9 @@ def calc_ogu_cell_counts_biom( return ogu_cell_counts_biom, log_msgs_list - -def calc_ogu_cell_counts_per_g_of_sample_for_qiita( +@deprecated("This function has been deprecated; " + "please use calc_ogu_cell_counts_per_g_of_sample_for_qiita instead.") +def calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input( sample_info_df: pd.DataFrame, prep_info_df: pd.DataFrame, linregress_by_sample_id_fp: str, @@ -1030,14 +1124,88 @@ def calc_ogu_cell_counts_per_g_of_sample_for_qiita( sample_info_df, [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY], "sample info is missing required column(s)") - return _calc_ogu_cell_counts_per_x_of_sample_for_qiita( + return _calc_ogu_cell_counts_per_x_of_sample_for_qiita_split_input( sample_info_df, prep_info_df, linregress_by_sample_id_fp, ogu_counts_per_sample_biom, ogu_coverage_df, ogu_lengths_fp, OGU_CELLS_PER_G_OF_SAMPLE_KEY, min_coverage, min_rsquared, syndna_mass_fraction_of_sample) -def calc_ogu_cell_counts_per_cm2_of_sample_for_qiita( +def calc_ogu_cell_counts_per_g_of_sample_for_qiita( + prep_info_df: pd.DataFrame, + linregress_by_sample_id_fp: str, + ogu_counts_per_sample_biom: biom.Table, + 
ogu_coverage_df: pd.DataFrame, + ogu_lengths_fp: str, + min_coverage: float, + min_rsquared: float = DEFAULT_MIN_RSQUARED, + syndna_mass_fraction_of_sample: float = + DEFAULT_SYNDNA_MASS_FRACTION_OF_SAMPLE) \ + -> Dict[str, Union[str, biom.Table]]: + """Calculates the number of cells per gram of sample material. + + Parameters + ---------- + prep_info_df: pd.DataFrame + A Dataframe containing prep info for all samples in the prep, + including SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY, and + ELUTE_VOL_UL_KEY, INPUT_SYNDNA_POOL_MASS_NG_KEY, and + SAMPLE_IN_ALIQUOT_MASS_G_KEY. + linregress_by_sample_id_fp: str + String containing the filepath to the yaml file holding the + dictionary keyed by sample id, containing for each sample a dictionary + representation of the sample's LinregressResult. + ogu_counts_per_sample_biom: biom.Table + Biom table holding the read counts aligned to each OGU in each sample. + ogu_coverage_df : pd.DataFrame + A DataFrame containing a column for OGU_ID_KEY and either a column for + OGU_PERCENT_COVERAGE_KEY (indicating the coverage is the same for all + samples) or a column for each sample id, which holds the coverage of + that OGU in that sample, expressed as either a fraction or a percentage. + NOTE THAT IT IS UP TO THE USER TO ENSURE THAT THEY KNOW WHICH TYPE OF + VALUE (fraction or percent) IS BEING USED AND THAT THEY PROVIDE THE + APPROPRIATE min_coverage VALUE ACCORDINGLY. + ogu_lengths_fp : str + String containing the filepath to a tab-separated, two-column, + no-header file in which the first column is the OGU id and the + second is the OGU length in basepairs + min_coverage : float + Minimum allowable coverage of an OGU in a sample needed to include + that OGU/sample in the output, expressed in the same units + (fraction or percent) as used in ogu_coverage_df. 
+ min_rsquared: float + Minimum allowable R^2 value for the linear regression model for a + sample; any sample with an R^2 value less than this will be excluded + from the output. + syndna_mass_fraction_of_sample: float + Fraction of the mass of the sample that is added as syndna (usually + 0.05, which is to say 5%). + + Returns + ------- + output_by_out_type : dict of str or biom.Table + Dictionary of outputs keyed by their type Currently, the following keys + are defined: + CELL_COUNT_RESULT_KEY: biom.Table holding the calculated number of + cells per gram of sample material for each OGU in each sample. + CELL_COUNT_LOG_KEY: log of messages from the cell count calc process. + """ + + # check if the input has the specific required columns + validate_required_columns_exist( + prep_info_df, [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY], + "prep info is missing required column(s)") + + return _calc_ogu_cell_counts_per_x_of_sample_for_qiita( + prep_info_df, linregress_by_sample_id_fp, + ogu_counts_per_sample_biom, ogu_coverage_df, ogu_lengths_fp, + OGU_CELLS_PER_G_OF_SAMPLE_KEY, min_coverage, min_rsquared, + syndna_mass_fraction_of_sample) + + +@deprecated("This function has been deprecated; " + "please use calc_ogu_cell_counts_per_cm2_of_sample_for_qiita instead.") +def calc_ogu_cell_counts_per_cm2_of_sample_for_qiita_split_input( sample_info_df: pd.DataFrame, prep_info_df: pd.DataFrame, linregress_by_sample_id_fp: str, @@ -1055,14 +1223,40 @@ def calc_ogu_cell_counts_per_cm2_of_sample_for_qiita( sample_info_df, [SAMPLE_ID_KEY, SAMPLE_SURFACE_AREA_CM2_KEY], "sample info is missing required column(s)") - return _calc_ogu_cell_counts_per_x_of_sample_for_qiita( + return _calc_ogu_cell_counts_per_x_of_sample_for_qiita_split_input( sample_info_df, prep_info_df, linregress_by_sample_id_fp, ogu_counts_per_sample_biom, ogu_coverage_df, ogu_lengths_fp, OGU_CELLS_PER_CM2_OF_SAMPLE_KEY, min_coverage, min_rsquared, syndna_mass_fraction_of_sample) -def 
calc_ogu_cell_counts_per_ul_of_sample_for_qiita( +def calc_ogu_cell_counts_per_cm2_of_sample_for_qiita( + prep_info_df: pd.DataFrame, + linregress_by_sample_id_fp: str, + ogu_counts_per_sample_biom: biom.Table, + ogu_coverage_df: pd.DataFrame, + ogu_lengths_fp: str, + min_coverage: float, + min_rsquared: float = DEFAULT_MIN_RSQUARED, + syndna_mass_fraction_of_sample: float = + DEFAULT_SYNDNA_MASS_FRACTION_OF_SAMPLE) \ + -> Dict[str, Union[str, biom.Table]]: + + # check if the input has the specific required columns + validate_required_columns_exist( + prep_info_df, [SAMPLE_ID_KEY, SAMPLE_SURFACE_AREA_CM2_KEY], + "prep info is missing required column(s)") + + return _calc_ogu_cell_counts_per_x_of_sample_for_qiita( + prep_info_df, linregress_by_sample_id_fp, + ogu_counts_per_sample_biom, ogu_coverage_df, ogu_lengths_fp, + OGU_CELLS_PER_CM2_OF_SAMPLE_KEY, min_coverage, min_rsquared, + syndna_mass_fraction_of_sample) + + +@deprecated("This function has been deprecated; " + "please use calc_ogu_cell_counts_per_ul_of_sample_for_qiita instead.") +def calc_ogu_cell_counts_per_ul_of_sample_for_qiita_split_input( sample_info_df: pd.DataFrame, prep_info_df: pd.DataFrame, linregress_by_sample_id_fp: str, @@ -1080,8 +1274,32 @@ def calc_ogu_cell_counts_per_ul_of_sample_for_qiita( sample_info_df, [SAMPLE_ID_KEY, SAMPLE_VOLUME_UL_KEY], "sample info is missing required column(s)") - return _calc_ogu_cell_counts_per_x_of_sample_for_qiita( + return _calc_ogu_cell_counts_per_x_of_sample_for_qiita_split_input( sample_info_df, prep_info_df, linregress_by_sample_id_fp, ogu_counts_per_sample_biom, ogu_coverage_df, ogu_lengths_fp, OGU_CELLS_PER_UL_OF_SAMPLE_KEY, min_coverage, min_rsquared, syndna_mass_fraction_of_sample) + + +def calc_ogu_cell_counts_per_ul_of_sample_for_qiita( + prep_info_df: pd.DataFrame, + linregress_by_sample_id_fp: str, + ogu_counts_per_sample_biom: biom.Table, + ogu_coverage_df: pd.DataFrame, + ogu_lengths_fp: str, + min_coverage: float, + min_rsquared: float = 
DEFAULT_MIN_RSQUARED, + syndna_mass_fraction_of_sample: float = + DEFAULT_SYNDNA_MASS_FRACTION_OF_SAMPLE) \ + -> Dict[str, Union[str, biom.Table]]: + + # check if the input has the specific required columns + validate_required_columns_exist( + prep_info_df, [SAMPLE_ID_KEY, SAMPLE_VOLUME_UL_KEY], + "prep info is missing required column(s)") + + return _calc_ogu_cell_counts_per_x_of_sample_for_qiita( + prep_info_df, linregress_by_sample_id_fp, + ogu_counts_per_sample_biom, ogu_coverage_df, ogu_lengths_fp, + OGU_CELLS_PER_UL_OF_SAMPLE_KEY, min_coverage, min_rsquared, + syndna_mass_fraction_of_sample) diff --git a/pysyndna/tests/test_calc_cell_counts.py b/pysyndna/tests/test_calc_cell_counts.py index 196084c..00efa5e 100644 --- a/pysyndna/tests/test_calc_cell_counts.py +++ b/pysyndna/tests/test_calc_cell_counts.py @@ -8,8 +8,12 @@ from unittest import TestCase from pysyndna import calc_ogu_cell_counts_biom, \ calc_ogu_cell_counts_per_g_of_sample_for_qiita, \ + calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input, \ calc_ogu_cell_counts_per_cm2_of_sample_for_qiita, \ calc_ogu_cell_counts_per_ul_of_sample_for_qiita +from pysyndna.src.calc_cell_counts import \ + calc_ogu_cell_counts_per_cm2_of_sample_for_qiita_split_input, \ + calc_ogu_cell_counts_per_ul_of_sample_for_qiita_split_input from pysyndna.src.util import OGU_ID_KEY from pysyndna.src.fit_syndna_models import SAMPLE_TOTAL_READS_KEY from pysyndna.src.calc_cell_counts import SAMPLE_ID_KEY, ELUTE_VOL_UL_KEY, \ @@ -631,6 +635,20 @@ def generate_sample_names_list(cls, use_filtered=True): class TestCalcCellCounts(TestCase): + _DEFAULT_SAMPLE_PLUS_PREP_COLS = [ + SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY, + GDNA_CONCENTRATION_NG_UL_KEY, ELUTE_VOL_UL_KEY, + INPUT_SYNDNA_POOL_MASS_NG_KEY] + + @classmethod + def make_sample_plus_prep_input_dict(cls, cols_to_include=None): + if cols_to_include is None: + cols_to_include = cls._DEFAULT_SAMPLE_PLUS_PREP_COLS + sample_plus_prep_info_dict = { + k: 
TestCalcCellCountsData.sample_and_prep_input_dict[k].copy() + for k in cols_to_include} + return sample_plus_prep_info_dict + def setUp(self): self.test_data_dir = os.path.join(os.path.dirname(__file__), 'data') @@ -638,6 +656,361 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita(self): # example4 is the same as example2 except that the elute volume is 70; # see "absolute_quant_example.xlsx" for details. example4_elute_vol = 70 + + prep_info_dict = self.make_sample_plus_prep_input_dict() + # NOTE: this column is not needed anymore. It is left in this test + # just to show that the code can deal with extra columns (it just + # ignores them). + prep_info_dict[SAMPLE_TOTAL_READS_KEY] = \ + TestCalcCellCountsData.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] + + # reset the sample ids and elute volume for example4 + sample_ids = ["example1", "example4"] + prep_info_dict[SAMPLE_ID_KEY] = sample_ids + prep_info_dict[ELUTE_VOL_UL_KEY][1] = example4_elute_vol + + # example4 has the same counts as example2 + counts_vals = TestCalcCellCountsData.make_combined_counts_np_array() + + prep_info_df = pd.DataFrame(prep_info_dict) + counts_biom = biom.table.Table( + counts_vals, + TestCalcCellCountsData.ogu_lengths_dict[OGU_ID_KEY], + sample_ids) + coverages_df = pd.DataFrame( + TestCalcCellCountsData.ogu_percent_coverage_dict) + models_fp = os.path.join(self.test_data_dir, "models.yml") + lengths_fp = os.path.join(self.test_data_dir, "ogu_lengths.tsv") + # Note that, in the output, the ogu_ids are apparently sorted + # alphabetically--different than the input order + expected_out_biom = biom.table.Table( + np.array(TestCalcCellCountsData.example1_example4_results_dict[ + OGU_CELLS_PER_G_OF_SAMPLE_KEY]), + TestCalcCellCountsData.example1_example4_results_dict[OGU_ID_KEY], + sample_ids) + + min_coverage = 10 + min_rsquared = 0.8 + + output_dict = calc_ogu_cell_counts_per_g_of_sample_for_qiita( + prep_info_df, models_fp, counts_biom, + coverages_df, lengths_fp, min_coverage, 
min_rsquared) + + self.assertSetEqual( + set(output_dict.keys()), + {CELL_COUNT_RESULT_KEY, CELL_COUNT_LOG_KEY}) + + a_tester = Testers() + a_tester.assert_biom_tables_equal( + expected_out_biom, output_dict[CELL_COUNT_RESULT_KEY], + decimal_precision=1) + self.assertEqual( + "The following items have coverage lower than the minimum of " + "10.0: ['Neisseria subflava', 'Haemophilus influenzae']", + output_dict[CELL_COUNT_LOG_KEY]) + + def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_casts(self): + # inputs are the same as in + # test_calc_ogu_cell_counts_per_g_of_sample_for_qiita EXCEPT that + # all the inputs are strings, including ones that must be ints/floats. + # These are automatically cast to what they need to be by the function. + + # example4 is the same as example2 except that the elute volume is 70; + # see "absolute_quant_example.xlsx" for details. + example4_elute_vol = 70 + + prep_info_dict = { + k: [str(x) for x in v] for k, v in + self.make_sample_plus_prep_input_dict().items()} + sample_ids = ["example1", "example4"] + prep_info_dict[SAMPLE_ID_KEY] = sample_ids + prep_info_dict[ELUTE_VOL_UL_KEY][1] = str(example4_elute_vol) + + coverages_df = pd.DataFrame( + TestCalcCellCountsData.ogu_percent_coverage_dict).astype(str) + + # example4 has the same counts as example2 + counts_vals = TestCalcCellCountsData.make_combined_counts_np_array() + + prep_info_df = pd.DataFrame(prep_info_dict) + counts_biom = biom.table.Table( + counts_vals, + TestCalcCellCountsData.ogu_lengths_dict[OGU_ID_KEY], + sample_ids) + models_fp = os.path.join(self.test_data_dir, "models.yml") + lengths_fp = os.path.join(self.test_data_dir, "ogu_lengths.tsv") + # Note that, in the output, the ogu_ids are apparently sorted + # alphabetically--different than the input order + expected_out_biom = biom.table.Table( + np.array(TestCalcCellCountsData.example1_example4_results_dict[ + OGU_CELLS_PER_G_OF_SAMPLE_KEY]), + 
TestCalcCellCountsData.example1_example4_results_dict[OGU_ID_KEY], + sample_ids) + + # pass in strings for the numeric values to ensure they get cast + min_coverage = "10" + min_rsquared = "0.8" + + output_dict = calc_ogu_cell_counts_per_g_of_sample_for_qiita( + prep_info_df, models_fp, counts_biom, + coverages_df, lengths_fp, min_coverage, min_rsquared) + + self.assertSetEqual( + set(output_dict.keys()), + {CELL_COUNT_RESULT_KEY, CELL_COUNT_LOG_KEY}) + + a_tester = Testers() + a_tester.assert_biom_tables_equal( + expected_out_biom, output_dict[CELL_COUNT_RESULT_KEY], + decimal_precision=1) + self.assertEqual( + "The following items have coverage lower than the minimum of " + "10.0: ['Neisseria subflava', 'Haemophilus influenzae']", + output_dict[CELL_COUNT_LOG_KEY]) + + def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_negs(self): + # inputs are the same as in + # test_calc_ogu_cell_counts_per_g_of_sample_for_qiita EXCEPT that + # the "example4" sample has a negative aliquot mass and thus its + # results are removed from the output biom table + + # example4 is the same as example2 except that the elute volume is 70; + # see "absolute_quant_example.xlsx" for details. 
+ example4_elute_vol = 70 + + prep_info_dict = self.make_sample_plus_prep_input_dict() + sample_ids = ["example1", "example4"] + prep_info_dict[SAMPLE_ID_KEY] = sample_ids + prep_info_dict[ELUTE_VOL_UL_KEY][1] = example4_elute_vol + prep_info_dict[SAMPLE_IN_ALIQUOT_MASS_G_KEY][1] = \ + -1 * prep_info_dict[SAMPLE_IN_ALIQUOT_MASS_G_KEY][1] + + # example4 has the same counts as example2 + counts_vals = TestCalcCellCountsData.make_combined_counts_np_array() + + # Results are returned only for example 1 because example 4 has a + # negative aliquot mass + ogu_cell_counts_per_g_sample = np.array( + [[x[0]] for x in + TestCalcCellCountsData.example1_example4_results_dict[ + OGU_CELLS_PER_G_OF_SAMPLE_KEY]] + ) + + prep_info_df = pd.DataFrame(prep_info_dict) + counts_biom = biom.table.Table( + counts_vals, + TestCalcCellCountsData.ogu_lengths_dict[OGU_ID_KEY], + sample_ids) + coverages_df = pd.DataFrame( + TestCalcCellCountsData.ogu_percent_coverage_dict) + models_fp = os.path.join(self.test_data_dir, "models.yml") + lengths_fp = os.path.join(self.test_data_dir, "ogu_lengths.tsv") + # Note that, in the output, the ogu_ids are apparently sorted + # alphabetically--different than the input order + expected_out_biom = biom.table.Table( + ogu_cell_counts_per_g_sample, + TestCalcCellCountsData.reordered_results_dict[OGU_ID_KEY], + [sample_ids[0]]) + + min_coverage = 10 + min_rsquared = 0.8 + + output_dict = calc_ogu_cell_counts_per_g_of_sample_for_qiita( + prep_info_df, models_fp, counts_biom, + coverages_df, lengths_fp, min_coverage, min_rsquared) + + self.assertSetEqual( + set(output_dict.keys()), + {CELL_COUNT_RESULT_KEY, CELL_COUNT_LOG_KEY}) + + a_tester = Testers() + a_tester.assert_biom_tables_equal( + expected_out_biom, output_dict[CELL_COUNT_RESULT_KEY], + decimal_precision=1) + self.assertEqual( + "Dropping samples with negative values in necessary " + "prep/sample column(s): example4\nThe following items have " + "coverage lower than the minimum of 10.0: ['Neisseria 
subflava', " + "'Haemophilus influenzae']", + output_dict[CELL_COUNT_LOG_KEY]) + + def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_prep_err(self): + # missing required columns--deliberately not using helper since this + # test needs an incomplete dict + prep_info_dict = {k: TestCalcCellCountsData.sample_and_prep_input_dict[k] for k in + [SAMPLE_ID_KEY, GDNA_CONCENTRATION_NG_UL_KEY]} + + counts_vals = TestCalcCellCountsData.make_combined_counts_np_array() + + prep_info_df = pd.DataFrame(prep_info_dict) + counts_biom = biom.table.Table( + counts_vals, + TestCalcCellCountsData.ogu_lengths_dict[OGU_ID_KEY], + prep_info_dict[SAMPLE_ID_KEY]) + coverages_df = pd.DataFrame( + TestCalcCellCountsData.ogu_percent_coverage_dict) + models_fp = os.path.join(self.test_data_dir, "models.yml") + lengths_fp = os.path.join(self.test_data_dir, "ogu_lengths.tsv") + + min_coverage = 1 + min_rsquared = 0.8 + + err_msg = r"prep info is missing required column\(s\): " \ + r"\['calc_mass_sample_aliquot_input_g'\]" + with self.assertRaisesRegex(ValueError, err_msg): + calc_ogu_cell_counts_per_g_of_sample_for_qiita( + prep_info_df, models_fp, counts_biom, + coverages_df, lengths_fp, min_coverage, min_rsquared) + + def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_ids_err(self): + prep_info_dict = self.make_sample_plus_prep_input_dict() + + counts_vals = TestCalcCellCountsData.make_combined_counts_np_array() + + # remove one of the sample ids from the prep info; this will cause + # an error because the biom table has a sample id not in prep info + prep_info_df = pd.DataFrame(prep_info_dict) + prep_info_df.drop(index=0, axis=0, inplace=True) + + sample_ids = TestCalcCellCountsData.sample_and_prep_input_dict[ + SAMPLE_ID_KEY] + counts_biom = biom.table.Table( + counts_vals, + TestCalcCellCountsData.ogu_lengths_dict[OGU_ID_KEY], + sample_ids) + coverages_df = pd.DataFrame( + TestCalcCellCountsData.ogu_percent_coverage_dict) + models_fp = os.path.join(self.test_data_dir, 
"models.yml") + lengths_fp = os.path.join(self.test_data_dir, "ogu_lengths.tsv") + + min_coverage = 10 + min_rsquared = 0.8 + + err_msg = (r"Found sample ids in OGU counts data that were not in" + r" sample info: \{'example1'\}") + with self.assertRaisesRegex(ValueError, err_msg): + calc_ogu_cell_counts_per_g_of_sample_for_qiita( + prep_info_df, models_fp, counts_biom, + coverages_df, lengths_fp, min_coverage, min_rsquared) + + def test_calc_ogu_cell_counts_per_cm2_of_sample_for_qiita(self): + # example4 is the same as example2 except that the elute volume is 70; + # see "absolute_quant_example.xlsx" for details. + example4_elute_vol = 70 + + prep_info_dict = self.make_sample_plus_prep_input_dict( + [SAMPLE_SURFACE_AREA_CM2_KEY, GDNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, INPUT_SYNDNA_POOL_MASS_NG_KEY]) + # NOTE: this column is not needed anymore. It is left in this test + # just to show that the code can deal with extra columns (it just + # ignores them). + prep_info_dict[SAMPLE_TOTAL_READS_KEY] = \ + TestCalcCellCountsData.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] + sample_ids = ["example1", "example4"] + prep_info_dict[SAMPLE_ID_KEY] = sample_ids + prep_info_dict[ELUTE_VOL_UL_KEY][1] = example4_elute_vol + + # example4 has the same counts as example2 + counts_vals = TestCalcCellCountsData.make_combined_counts_np_array() + + prep_info_df = pd.DataFrame(prep_info_dict) + counts_biom = biom.table.Table( + counts_vals, + TestCalcCellCountsData.ogu_lengths_dict[OGU_ID_KEY], + sample_ids) + coverages_df = pd.DataFrame( + TestCalcCellCountsData.ogu_percent_coverage_dict) + models_fp = os.path.join(self.test_data_dir, "models.yml") + lengths_fp = os.path.join(self.test_data_dir, "ogu_lengths.tsv") + # Note that, in the output, the ogu_ids are apparently sorted + # alphabetically--different than the input order + expected_out_biom = biom.table.Table( + np.array(TestCalcCellCountsData.example1_example4_results_dict[ + OGU_CELLS_PER_CM2_OF_SAMPLE_KEY]), + 
TestCalcCellCountsData.reordered_results_dict[OGU_ID_KEY], + sample_ids) + + min_coverage = 10 + min_rsquared = 0.8 + + output_dict = calc_ogu_cell_counts_per_cm2_of_sample_for_qiita( + prep_info_df, models_fp, counts_biom, + coverages_df, lengths_fp, min_coverage, min_rsquared) + + self.assertSetEqual( + set(output_dict.keys()), + {CELL_COUNT_RESULT_KEY, CELL_COUNT_LOG_KEY}) + + a_tester = Testers() + a_tester.assert_biom_tables_equal( + expected_out_biom, output_dict[CELL_COUNT_RESULT_KEY], + decimal_precision=1) + self.assertEqual( + "The following items have coverage lower than the minimum of " + "10.0: ['Neisseria subflava', 'Haemophilus influenzae']", + output_dict[CELL_COUNT_LOG_KEY]) + + def test_calc_ogu_cell_counts_per_ul_of_sample_for_qiita(self): + # example4 is the same as example2 except that the elute volume is 70; + # see "absolute_quant_example.xlsx" for details. + example4_elute_vol = 70 + + prep_info_dict = self.make_sample_plus_prep_input_dict( + [SAMPLE_VOLUME_UL_KEY, GDNA_CONCENTRATION_NG_UL_KEY, + ELUTE_VOL_UL_KEY, INPUT_SYNDNA_POOL_MASS_NG_KEY]) + # NOTE: this column is not needed anymore. It is left in this test + # just to show that the code can deal with extra columns (it just + # ignores them). 
+ prep_info_dict[SAMPLE_TOTAL_READS_KEY] = \ + TestCalcCellCountsData.mass_and_totals_dict[SAMPLE_TOTAL_READS_KEY] + sample_ids = ["example1", "example4"] + prep_info_dict[SAMPLE_ID_KEY] = sample_ids + prep_info_dict[ELUTE_VOL_UL_KEY][1] = example4_elute_vol + + # example4 has the same counts as example2 + counts_vals = TestCalcCellCountsData.make_combined_counts_np_array() + + prep_info_df = pd.DataFrame(prep_info_dict) + counts_biom = biom.table.Table( + counts_vals, + TestCalcCellCountsData.ogu_lengths_dict[OGU_ID_KEY], + sample_ids) + coverages_df = pd.DataFrame( + TestCalcCellCountsData.ogu_percent_coverage_dict) + models_fp = os.path.join(self.test_data_dir, "models.yml") + lengths_fp = os.path.join(self.test_data_dir, "ogu_lengths.tsv") + # Note that, in the output, the ogu_ids are apparently sorted + # alphabetically--different than the input order + expected_out_biom = biom.table.Table( + np.array(TestCalcCellCountsData.example1_example4_results_dict[ + OGU_CELLS_PER_UL_OF_SAMPLE_KEY]), + TestCalcCellCountsData.reordered_results_dict[OGU_ID_KEY], + sample_ids) + + min_coverage = 10 + min_rsquared = 0.8 + + output_dict = calc_ogu_cell_counts_per_ul_of_sample_for_qiita( + prep_info_df, models_fp, counts_biom, + coverages_df, lengths_fp, min_coverage, min_rsquared) + + self.assertSetEqual( + set(output_dict.keys()), + {CELL_COUNT_RESULT_KEY, CELL_COUNT_LOG_KEY}) + + a_tester = Testers() + a_tester.assert_biom_tables_equal( + expected_out_biom, output_dict[CELL_COUNT_RESULT_KEY], + decimal_precision=1) + self.assertEqual( + "The following items have coverage lower than the minimum of " + "10.0: ['Neisseria subflava', 'Haemophilus influenzae']", + output_dict[CELL_COUNT_LOG_KEY]) + + def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input(self): + # example4 is the same as example2 except that the elute volume is 70; + # see "absolute_quant_example.xlsx" for details. 
+ example4_elute_vol = 70 sample_ids = ["example1", "example4"] sample_info_dict = {k: TestCalcCellCountsData.sample_and_prep_input_dict[k].copy() for k in [SAMPLE_IN_ALIQUOT_MASS_G_KEY]} @@ -679,7 +1052,7 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita(self): min_coverage = 10 min_rsquared = 0.8 - output_dict = calc_ogu_cell_counts_per_g_of_sample_for_qiita( + output_dict = calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input( sample_info_df, prep_info_df, models_fp, counts_biom, coverages_df, lengths_fp, min_coverage, min_rsquared) @@ -696,11 +1069,11 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita(self): "10.0: ['Neisseria subflava', 'Haemophilus influenzae']", output_dict[CELL_COUNT_LOG_KEY]) - def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_casts(self): + def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input_w_casts(self): # inputs are the same as in - # test_calc_ogu_cell_counts_per_g_of_sample_for_qiita EXCEPT that - # all the inputs are strings, including ones that must be ints/floats. - # These are automatically cast to what they need to be by the function. + # test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input + # EXCEPT that all the inputs are strings, including ones that must be + # ints/floats. These are automatically cast to what they need to be. # example4 is the same as example2 except that the elute volume is 70; # see "absolute_quant_example.xlsx" for details. 
@@ -744,7 +1117,7 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_casts(self): min_coverage = "10" min_rsquared = "0.8" - output_dict = calc_ogu_cell_counts_per_g_of_sample_for_qiita( + output_dict = calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input( sample_info_df, prep_info_df, models_fp, counts_biom, coverages_df, lengths_fp, min_coverage, min_rsquared) @@ -761,9 +1134,9 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_casts(self): "10.0: ['Neisseria subflava', 'Haemophilus influenzae']", output_dict[CELL_COUNT_LOG_KEY]) - def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_negs(self): + def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input_w_negs(self): # inputs are the same as in - # test_calc_ogu_cell_counts_per_g_of_sample_for_qiita EXCEPT that + # test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input EXCEPT that # the "example4" sample has a negative aliquot mass and thus its # results are removed from the output biom table @@ -816,7 +1189,7 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_negs(self): min_coverage = 10 min_rsquared = 0.8 - output_dict = calc_ogu_cell_counts_per_g_of_sample_for_qiita( + output_dict = calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input( sample_info_df, prep_info_df, models_fp, counts_biom, coverages_df, lengths_fp, min_coverage, min_rsquared) @@ -835,7 +1208,7 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_negs(self): "'Haemophilus influenzae']", output_dict[CELL_COUNT_LOG_KEY]) - def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_sample_err(self): + def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input_w_sample_err(self): # missing a required column column sample_info_dict = {k: TestCalcCellCountsData.sample_and_prep_input_dict for k in [SAMPLE_ID_KEY]} @@ -862,11 +1235,11 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_sample_err(self): err_msg = r"sample info is missing required column\(s\): " 
\ r"\['calc_mass_sample_aliquot_input_g'\]" with self.assertRaisesRegex(ValueError, err_msg): - calc_ogu_cell_counts_per_g_of_sample_for_qiita( + calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input( sample_info_df, prep_info_df, models_fp, counts_biom, coverages_df, lengths_fp, min_coverage, min_rsquared) - def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_prep_err(self): + def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input_w_prep_err(self): sample_info_dict = {k: TestCalcCellCountsData.sample_and_prep_input_dict[k] for k in [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY]} @@ -893,11 +1266,11 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_prep_err(self): err_msg = r"prep info is missing required column\(s\): " \ r"\[\'mass_syndna_input_ng'\, 'vol_extracted_elution_ul'\]" with self.assertRaisesRegex(ValueError, err_msg): - calc_ogu_cell_counts_per_g_of_sample_for_qiita( + calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input( sample_info_df, prep_info_df, models_fp, counts_biom, coverages_df, lengths_fp, min_coverage, min_rsquared) - def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_ids_err(self): + def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input_w_ids_err(self): sample_info_dict = {k: TestCalcCellCountsData.sample_and_prep_input_dict[k] for k in [SAMPLE_ID_KEY, SAMPLE_IN_ALIQUOT_MASS_G_KEY]} @@ -929,11 +1302,11 @@ def test_calc_ogu_cell_counts_per_g_of_sample_for_qiita_w_ids_err(self): err_msg = (r"Found sample ids in prep info that were not in" r" sample info: \{'example1'\}") with self.assertRaisesRegex(ValueError, err_msg): - calc_ogu_cell_counts_per_g_of_sample_for_qiita( + calc_ogu_cell_counts_per_g_of_sample_for_qiita_split_input( sample_info_df, prep_info_df, models_fp, counts_biom, coverages_df, lengths_fp, min_coverage, min_rsquared) - def test_calc_ogu_cell_counts_per_cm2_of_sample_for_qiita(self): + def test_calc_ogu_cell_counts_per_cm2_of_sample_for_qiita_split_input(self): # 
example4 is the same as example2 except that the elute volume is 70; # see "absolute_quant_example.xlsx" for details. example4_elute_vol = 70 @@ -980,7 +1353,7 @@ def test_calc_ogu_cell_counts_per_cm2_of_sample_for_qiita(self): min_coverage = 10 min_rsquared = 0.8 - output_dict = calc_ogu_cell_counts_per_cm2_of_sample_for_qiita( + output_dict = calc_ogu_cell_counts_per_cm2_of_sample_for_qiita_split_input( sample_info_df, prep_info_df, models_fp, counts_biom, coverages_df, lengths_fp, min_coverage, min_rsquared) @@ -997,7 +1370,7 @@ def test_calc_ogu_cell_counts_per_cm2_of_sample_for_qiita(self): "10.0: ['Neisseria subflava', 'Haemophilus influenzae']", output_dict[CELL_COUNT_LOG_KEY]) - def test_calc_ogu_cell_counts_per_ul_of_sample_for_qiita(self): + def test_calc_ogu_cell_counts_per_ul_of_sample_for_qiita_split_input(self): # example4 is the same as example2 except that the elute volume is 70; # see "absolute_quant_example.xlsx" for details. example4_elute_vol = 70 @@ -1044,7 +1417,7 @@ def test_calc_ogu_cell_counts_per_ul_of_sample_for_qiita(self): min_coverage = 10 min_rsquared = 0.8 - output_dict = calc_ogu_cell_counts_per_ul_of_sample_for_qiita( + output_dict = calc_ogu_cell_counts_per_ul_of_sample_for_qiita_split_input( sample_info_df, prep_info_df, models_fp, counts_biom, coverages_df, lengths_fp, min_coverage, min_rsquared)