Skip to content

Commit 634727f

Browse files
committed
Allow DEA to be performed on a subset of the samples
1 parent f67c285 commit 634727f

File tree

10 files changed

+647
-424
lines changed

10 files changed

+647
-424
lines changed

gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java

Lines changed: 72 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
import ubic.gemma.core.analysis.service.ExpressionDataFileService;
4040
import ubic.gemma.model.analysis.expression.diff.DifferentialExpressionAnalysis;
4141
import ubic.gemma.model.common.auditAndSecurity.eventType.DifferentialExpressionAnalysisEvent;
42+
import ubic.gemma.model.expression.biomaterial.BioMaterial;
4243
import ubic.gemma.model.expression.experiment.*;
4344
import ubic.gemma.persistence.service.analysis.expression.diff.DifferentialExpressionAnalysisService;
4445

@@ -80,21 +81,40 @@ private enum Mode {
8081
COMPLETE_FACTORS
8182
}
8283

84+
/**
85+
* Mode for selecting which factors to include in the analysis.
86+
*/
87+
private enum FactorSelectionMode {
88+
/**
89+
* Select factors based on a previous analysis.
90+
*/
91+
REDO,
92+
/**
93+
* Select factors automatically.
94+
*/
95+
AUTOMATIC,
96+
/**
97+
* Select factors manually. The values that are in {@link #factorIdentifiers} will be considered.
98+
*/
99+
MANUAL
100+
}
101+
83102
/**
84103
* Specific analyses to redo.
85104
*/
86105
@Nullable
87-
private Collection<Long> analysisIds = null;
106+
private Collection<Long> analysisIds;
88107

89108
@Nullable
90-
private Collection<Long> subsetIds = null;
109+
private Collection<Long> subsetIds;
91110

92111
/**
93112
* Indicate the type of analysis to perform.
94113
* <p>
95114
* The default is to detect it based on the dataset and the requested factors.
96115
*/
97-
private AnalysisType type = null;
116+
@Nullable
117+
private AnalysisType type;
98118

99119
/**
100120
* Mode for selecting factors.
@@ -114,24 +134,33 @@ private enum Mode {
114134
private String subsetFactorIdentifier;
115135

116136
/**
117-
* Whether batch factors should be included (if they exist)
137+
* Whether batch factors should be ignored (if they exist).
138+
* <p>
139+
* Defaults to true.
118140
*/
119-
private boolean ignoreBatch = true;
141+
private boolean ignoreBatch;
120142

121143
/**
122144
* Use moderated statistics.
123145
*/
124-
private boolean moderateStatistics = DifferentialExpressionAnalysisConfig.DEFAULT_MODERATE_STATISTICS;
146+
private boolean moderateStatistics;
125147

126148
/**
127-
* Persist results to the database.
149+
* Ignore failure of subset analyses.
150+
* <p>
151+
* The analysis will still fail if all subset analyses fail.
128152
*/
129-
private boolean persist = true;
153+
private boolean ignoreFailingSubsets;
130154

131-
private boolean makeArchiveFiles = true;
132-
133-
private boolean ignoreFailingSubsets = false;
155+
/**
156+
* List of sample identifiers to include in the analysis.
157+
* <p>
158+
* Defaults to all samples.
159+
*/
160+
@Nullable
161+
private String[] sampleIdentifiers;
134162

163+
// data filtering options
135164
@Nullable
136165
private Integer filterMinNumberOfCellsPerSample;
137166
@Nullable
@@ -144,13 +173,22 @@ private enum Mode {
144173
@Nullable
145174
private Double filterMinVariance;
146175

147-
private DataFileOptionValue destination;
176+
/**
177+
* Persist results to the database.
178+
*/
179+
private boolean persist ;
148180

149-
enum FactorSelectionMode {
150-
REDO,
151-
AUTOMATIC,
152-
MANUAL
153-
}
181+
/**
182+
* Create archive files.
183+
*/
184+
private boolean makeArchiveFiles ;
185+
186+
/**
187+
* Destination to use for archive files.
188+
* <p>
189+
* This is only applicable if {@link #makeArchiveFiles} is true.
190+
*/
191+
private DataFileOptionValue destination;
154192

155193
@Override
156194
public String getCommandName() {
@@ -197,6 +235,9 @@ protected void buildExperimentOptions( Options options ) {
197235
+ "If the experiment already has subsets for the factor, those will be reused. "
198236
+ "This is incompatible with -redo,--redo, -redoAnalysis,--redo-analysis or -redoSubset,--redo-subset." ).get() );
199237

238+
addSingleExperimentOption( options, Option.builder( "samples" ).longOpt( "samples" ).argName( "ID, name, accession" )
239+
.desc( "ID, name or accession of samples to be included in the analysis. Defaults to all samples in the dataset being analyzed. Requires the " + formatOption( "nodb", "no-db" ) + " option to be set." ).get() );
240+
200241
options.addOption( "usebatch", "use-batch-factor", false, "If a batch factor is available, use it. Otherwise, batch information can/will be ignored in the analysis. This is incompatible with " + formatOption( options, "factors" ) + ", -redo,--redo, -redoAnalysis,--redo-analysis and -redoSubset,--redo-subset." );
201242
options.addOption( "nobayes", "no-bayes", false, "Do not apply empirical-Bayes moderated statistics. Default is to use eBayes." );
202243
options.addOption( "ignoreFailingSubsets", "ignore-failing-subsets", false, "Ignore failing subsets and continue processing other subsets. Requires the " + formatOption( options, "subset" ) + " option to be set or -redo,--redo option with existing subset analyses." );
@@ -205,7 +246,7 @@ protected void buildExperimentOptions( Options options ) {
205246

206247
// destination (db, standard location or custom directory)
207248
options.addOption( "nodb", "no-db", false, "Do not persist diff. ex. results to the database and instead save them to the current directory (or the location defined by " + formatOption( options, DataFileOptionsUtils.OUTPUT_DIR_OPTION ) + ")." );
208-
options.addOption( "nofiles", "no-files", false, "Don't create archive files after analysis. Default is to make them. This is incompatible with " + formatOption( options, "nodb" ) + "." );
249+
options.addOption( "nofiles", "no-files", false, "Don't create archive files after analysis. Default is to make them. This is incompatible with " + formatOption( options, "nodb" ) + " option to be set." );
209250

210251
// redo mode
211252
options.addOption( "redo", "redo", false,
@@ -346,6 +387,7 @@ protected void processExperimentOptions( CommandLine commandLine ) throws ParseE
346387
// subset analysis can only be done in manual mode
347388
// note we add the given factor to the list of factors overall to make sure it is considered
348389
this.subsetFactorIdentifier = getOptionValue( commandLine, "subset", requires( allOf( toBeUnset( "redo" ), toBeUnset( "redoAnalysis" ), toBeUnset( "redoSubset" ) ) ) );
390+
this.sampleIdentifiers = getOptionValues( commandLine, "samples", requires( toBeSet( "nodb" ) ) );
349391
// we can only force the use of a batch factor during automatic selection
350392
this.ignoreBatch = !hasOption( commandLine, "usebatch", requires( allOf( toBeUnset( "factors" ), toBeUnset( "redo" ), toBeUnset( "redoAnalysis" ), toBeUnset( "redoSubset" ) ) ) );
351393
this.moderateStatistics = !commandLine.hasOption( "nobayes" );
@@ -433,7 +475,18 @@ protected void processExpressionExperiment( ExpressionExperiment ee ) {
433475
config.setPersist( this.persist );
434476
config.setMakeArchiveFile( this.persist && this.makeArchiveFiles );
435477
config.setIgnoreFailingSubsets( this.ignoreFailingSubsets );
436-
config.setUseWeights( super.eeService.isRNASeq( ee ) );
478+
config.setUseWeights( eeService.isRNASeq( ee ) );
479+
480+
// sample selection
481+
if ( this.sampleIdentifiers != null ) {
482+
Set<BioMaterial> samplesToUse = new HashSet<>();
483+
for ( String sampleId : sampleIdentifiers ) {
484+
// allow searching in subsets, those are used in single-cell diff ex. analysis that use sub-samples for
485+
// representing pseudo-bulks
486+
samplesToUse.add( entityLocator.locateSample( ee, sampleId, true ) );
487+
}
488+
config.setSamplesToInclude( samplesToUse );
489+
}
437490

438491
// filtering
439492
config.setRepetitiveValuesFilterMode( this.filterMode );

gemma-cli/src/main/java/ubic/gemma/cli/completion/CompletionType.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ public enum CompletionType {
77
DATASET_GROUP,
88
EXTERNAL_DATABASE,
99
DATASET,
10+
SAMPLE,
1011
EXPERIMENTAL_FACTOR,
1112
/**
1213
* Complete experimental factors and interactions suitable for DEA.

gemma-cli/src/main/java/ubic/gemma/cli/util/EntityLocator.java

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@
88
import ubic.gemma.model.expression.bioAssayData.CellLevelCharacteristics;
99
import ubic.gemma.model.expression.bioAssayData.CellTypeAssignment;
1010
import ubic.gemma.model.expression.bioAssayData.DataVector;
11+
import ubic.gemma.model.expression.biomaterial.BioMaterial;
1112
import ubic.gemma.model.expression.experiment.ExperimentalFactor;
1213
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
14+
import ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet;
1315
import ubic.gemma.model.genome.Taxon;
1416

1517
import java.util.Collection;
@@ -39,9 +41,35 @@ public interface EntityLocator {
3941

4042
ExperimentalFactor locateExperimentalFactor( ExpressionExperiment expressionExperiment, String ctfName );
4143

42-
BioAssay locateBioAssay( ExpressionExperiment ee, String sampleId );
44+
/**
45+
* Locate an assay by its identifier.
46+
*
47+
* @param ee dataset to lookup
48+
* @param assayId assay identifier to lookup
49+
* @param includeSubSets whether to include assays that belong to subsets of the experiment. This is only relevant
50+
* for {@link ExpressionExperimentSubSet} that "own" their assays instead of sharing them with
51+
* the source experiment.
52+
*/
53+
BioAssay locateBioAssay( ExpressionExperiment ee, String assayId, boolean includeSubSets );
4354

55+
/**
56+
* Locate an assay by its identifier in a particular set of vectors.
57+
*
58+
* @param ee dataset to lookup
59+
* @param quantitationType quantitation type for the vectors
60+
* @param sampleId sample identifier to lookup
61+
*/
4462
BioAssay locateBioAssay( ExpressionExperiment ee, QuantitationType quantitationType, String sampleId );
4563

64+
/**
65+
* Locate a sample by its identifier.
66+
* @param ee dataset to lookup
67+
* @param sampleId sample identifier to lookup
68+
* @param includeSubSets whether to include samples associated to assays that belong to subsets of the experiment.
69+
* This is only relevant for {@link ExpressionExperimentSubSet}
70+
* that "own" their assays instead of sharing them with the source experiment.
71+
*/
72+
BioMaterial locateSample( ExpressionExperiment ee, String sampleId, boolean includeSubSets );
73+
4674
DifferentialExpressionAnalysis locateDiffExAnalysis( ExpressionExperiment ee, String analysisIdentifier );
4775
}

gemma-cli/src/main/java/ubic/gemma/cli/util/EntityLocatorImpl.java

Lines changed: 59 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import lombok.extern.apachecommons.CommonsLog;
44
import org.apache.commons.lang3.StringUtils;
5+
import org.apache.commons.lang3.Strings;
56
import org.apache.commons.lang3.tuple.Pair;
67
import org.springframework.beans.factory.annotation.Autowired;
78
import org.springframework.stereotype.Component;
@@ -15,6 +16,7 @@
1516
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
1617
import ubic.gemma.model.expression.bioAssay.BioAssay;
1718
import ubic.gemma.model.expression.bioAssayData.*;
19+
import ubic.gemma.model.expression.biomaterial.BioMaterial;
1820
import ubic.gemma.model.expression.experiment.ExperimentalFactor;
1921
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
2022
import ubic.gemma.model.genome.Taxon;
@@ -268,7 +270,7 @@ public ExperimentalFactor locateExperimentalFactor( ExpressionExperiment express
268270
ExperimentalFactor factor;
269271
try {
270272
Long efId = Long.parseLong( identifier );
271-
if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getId().equals( efId ) ) ) != null ) {
273+
if ( ( factor = matchOneFactor( expressionExperiment, ef -> Objects.equals( ef.getId(), efId ) ) ) != null ) {
272274
return factor;
273275
}
274276
} catch ( NumberFormatException e ) {
@@ -287,16 +289,16 @@ public ExperimentalFactor locateExperimentalFactor( ExpressionExperiment express
287289
}
288290

289291
// match by category
290-
if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && StringUtils.equalsIgnoreCase( ef.getCategory().getCategory(), finalIdentifier ) ) ) != null ) {
292+
if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && Strings.CI.equals( ef.getCategory().getCategory(), finalIdentifier ) ) ) != null ) {
291293
return factor;
292294
}
293-
if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && StringUtils.equalsIgnoreCase( ef.getCategory().getCategoryUri(), finalIdentifier ) ) ) != null ) {
295+
if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && Strings.CI.equals( ef.getCategory().getCategoryUri(), finalIdentifier ) ) ) != null ) {
294296
return factor;
295297
}
296-
if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && StringUtils.equalsIgnoreCase( ef.getCategory().getValue(), finalIdentifier ) ) ) != null ) {
298+
if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && Strings.CI.equals( ef.getCategory().getValue(), finalIdentifier ) ) ) != null ) {
297299
return factor;
298300
}
299-
if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && StringUtils.equalsIgnoreCase( ef.getCategory().getValueUri(), finalIdentifier ) ) ) != null ) {
301+
if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && Strings.CI.equals( ef.getCategory().getValueUri(), finalIdentifier ) ) ) != null ) {
300302
return factor;
301303
}
302304

@@ -321,10 +323,9 @@ private ExperimentalFactor matchOneFactor( ExpressionExperiment ee, Predicate<Ex
321323
}
322324

323325
@Override
324-
public BioAssay locateBioAssay( ExpressionExperiment ee, String sampleId ) {
325-
ee = eeService.thawLite( ee );
326-
return requireNonNull( locateBioAssay( ee.getBioAssays(), sampleId ),
327-
"Could not locate any assay matching '" + sampleId + "' in " + ee.getShortName() + "." + formatPossibleValues( ee.getBioAssays(), true ) );
326+
public BioAssay locateBioAssay( ExpressionExperiment ee, String assayId, boolean includeSubSets ) {
327+
return requireNonNull( locateBioAssay( eeService.getBioAssays( ee, includeSubSets ), assayId ),
328+
"Could not locate any assay matching '" + assayId + "' in " + ee.getShortName() + "." + formatPossibleValues( ee.getBioAssays(), true ) );
328329
}
329330

330331
@Override
@@ -342,47 +343,80 @@ public BioAssay locateBioAssay( ExpressionExperiment ee, QuantitationType qt, St
342343
throw new NullPointerException();
343344
}
344345

345-
@Override
346-
public DifferentialExpressionAnalysis locateDiffExAnalysis( ExpressionExperiment ee, String analysisIdentifier ) {
347-
return requireNonNull( differentialExpressionAnalysisService.findByExperimentAndAnalysisId( ee, true, Long.parseLong( analysisIdentifier ) ),
348-
() -> String.format( "Could not locate an analysis matching '%s' in %s.%s",
349-
analysisIdentifier,
350-
ee.getShortName(),
351-
formatPossibleValues( differentialExpressionAnalysisService.findByExperiment( ee, true ), false ) ) );
352-
}
353-
354346
@Nullable
355-
private BioAssay locateBioAssay( Collection<BioAssay> ee, String sampleId ) {
347+
private BioAssay locateBioAssay( Collection<BioAssay> candidates, String sampleId ) {
356348
BioAssay ba;
357349
try {
358350
Long id = Long.parseLong( sampleId );
359-
if ( ( ba = matchOneAssay( ee, ba2 -> ba2.getId().equals( id ) ) ) != null ) {
351+
if ( ( ba = matchOneAssay( candidates, ba2 -> Objects.equals( ba2.getId(), id ) ) ) != null ) {
360352
return ba;
361353
}
362354
} catch ( NumberFormatException e ) {
363355
// ignore
364356
}
365-
if ( ( ba = matchOneAssay( ee, ba2 -> ba2.getShortName() != null && ba2.getShortName().equalsIgnoreCase( sampleId ) ) ) != null ) {
357+
if ( ( ba = matchOneAssay( candidates, ba2 -> ba2.getShortName() != null && ba2.getShortName().equalsIgnoreCase( sampleId ) ) ) != null ) {
358+
return ba;
359+
}
360+
if ( ( ba = matchOneAssay( candidates, ba2 -> ba2.getName().equalsIgnoreCase( sampleId ) ) ) != null ) {
366361
return ba;
367362
}
368-
if ( ( ba = matchOneAssay( ee, ba2 -> ba2.getName().equalsIgnoreCase( sampleId ) ) ) != null ) {
363+
if ( ( ba = matchOneAssay( candidates, ba2 -> ba2.getAccession() != null && ba2.getAccession().getAccession().equalsIgnoreCase( sampleId ) ) ) != null ) {
364+
return ba;
365+
}
366+
return null;
367+
}
368+
369+
private BioAssay matchOneAssay( Collection<BioAssay> candidates, Predicate<BioAssay> ba ) {
370+
Set<BioAssay> bas = candidates.stream().filter( ba ).collect( Collectors.toSet() );
371+
if ( bas.size() == 1 ) {
372+
return bas.iterator().next();
373+
} else {
374+
return null;
375+
}
376+
}
377+
378+
@Override
379+
public BioMaterial locateSample( ExpressionExperiment ee, String sampleId, boolean includeSubSets ) {
380+
return locateSample( eeService.getSamplesUsed( ee, includeSubSets ), sampleId );
381+
}
382+
383+
private BioMaterial locateSample( Collection<BioMaterial> candidates, String sampleId ) {
384+
BioMaterial ba;
385+
try {
386+
Long id = Long.parseLong( sampleId );
387+
if ( ( ba = matchOneSample( candidates, ba2 -> Objects.equals( ba2.getId(), id ) ) ) != null ) {
388+
return ba;
389+
}
390+
} catch ( NumberFormatException e ) {
391+
// ignore
392+
}
393+
if ( ( ba = matchOneSample( candidates, ba2 -> ba2.getName().equalsIgnoreCase( sampleId ) ) ) != null ) {
369394
return ba;
370395
}
371-
if ( ( ba = matchOneAssay( ee, ba2 -> ba2.getAccession() != null && ba2.getAccession().getAccession().equalsIgnoreCase( sampleId ) ) ) != null ) {
396+
if ( ( ba = matchOneSample( candidates, ba2 -> ba2.getExternalAccession() != null && ba2.getExternalAccession().getAccession().equalsIgnoreCase( sampleId ) ) ) != null ) {
372397
return ba;
373398
}
374399
return null;
375400
}
376401

377-
private BioAssay matchOneAssay( Collection<BioAssay> bioAssays, Predicate<BioAssay> ba ) {
378-
Set<BioAssay> bas = bioAssays.stream().filter( ba ).collect( Collectors.toSet() );
402+
private BioMaterial matchOneSample( Collection<BioMaterial> candidates, Predicate<BioMaterial> ba ) {
403+
Set<BioMaterial> bas = candidates.stream().filter( ba ).collect( Collectors.toSet() );
379404
if ( bas.size() == 1 ) {
380405
return bas.iterator().next();
381406
} else {
382407
return null;
383408
}
384409
}
385410

411+
@Override
412+
public DifferentialExpressionAnalysis locateDiffExAnalysis( ExpressionExperiment ee, String analysisIdentifier ) {
413+
return requireNonNull( differentialExpressionAnalysisService.findByExperimentAndAnalysisId( ee, true, Long.parseLong( analysisIdentifier ) ),
414+
() -> String.format( "Could not locate an analysis matching '%s' in %s.%s",
415+
analysisIdentifier,
416+
ee.getShortName(),
417+
formatPossibleValues( differentialExpressionAnalysisService.findByExperiment( ee, true ), false ) ) );
418+
}
419+
386420
private String formatPossibleValues( Collection<? extends Identifiable> possibleValues, boolean allowAmbiguousIds ) {
387421
if ( possibleValues.isEmpty() ) {
388422
return "";

0 commit comments

Comments
 (0)