Merge pull request #337 from broadinstitute/dr_runtime_sample_renaming_GSA-974
GATK engine: add ability to do on-the-fly BAM file sample renaming at runtime
This commit is contained in:
commit
b992dcd9c2
|
|
@ -62,9 +62,11 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
|
|||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
import org.broadinstitute.sting.utils.progressmeter.ProgressMeter;
|
||||
import org.broadinstitute.sting.utils.recalibration.BQSRArgumentSet;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
|
|
@ -854,6 +856,10 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker;
|
||||
|
||||
final Map<SAMReaderID, String> sampleRenameMap = argCollection.sampleRenameMappingFile != null ?
|
||||
loadSampleRenameMap(argCollection.sampleRenameMappingFile) :
|
||||
null;
|
||||
|
||||
return new SAMDataSource(
|
||||
samReaderIDs,
|
||||
threadAllocation,
|
||||
|
|
@ -869,9 +875,63 @@ public class GenomeAnalysisEngine {
|
|||
includeReadsWithDeletionAtLoci(),
|
||||
argCollection.defaultBaseQualities,
|
||||
removeProgramRecords,
|
||||
keepReadsInLIBS);
|
||||
keepReadsInLIBS,
|
||||
sampleRenameMap);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a user-provided sample rename map file for use in on-the-fly sample renaming into an in-memory
|
||||
* HashMap. This file must consist of lines with two whitespace-separated fields:
|
||||
*
|
||||
* absolute_path_to_bam_file new_sample_name
|
||||
*
|
||||
* The engine will verify that each bam file contains reads from only one sample when the on-the-fly sample
|
||||
* renaming feature is being used.
|
||||
*
|
||||
* @param sampleRenameMapFile sample rename map file from which to load data
|
||||
* @return a HashMap containing the contents of the map file, with the keys being the bam file paths and
|
||||
* the values being the new sample names.
|
||||
*/
|
||||
protected Map<SAMReaderID, String> loadSampleRenameMap( final File sampleRenameMapFile ) {
|
||||
logger.info("Renaming samples from BAM files on-the-fly using mapping file " + sampleRenameMapFile.getAbsolutePath());
|
||||
|
||||
final Map<SAMReaderID, String> sampleRenameMap = new HashMap<>((int)sampleRenameMapFile.length() / 50);
|
||||
|
||||
try {
|
||||
for ( final String line : new XReadLines(sampleRenameMapFile) ) {
|
||||
final String[] tokens = line.split("\\s+");
|
||||
|
||||
if ( tokens.length != 2 ) {
|
||||
throw new UserException.MalformedFile(sampleRenameMapFile,
|
||||
String.format("Encountered a line with %s fields instead of the required 2 fields. Line was: %s",
|
||||
tokens.length, line));
|
||||
}
|
||||
|
||||
final File bamFile = new File(tokens[0]);
|
||||
final String newSampleName = tokens[1];
|
||||
|
||||
if ( ! bamFile.isAbsolute() ) {
|
||||
throw new UserException.MalformedFile(sampleRenameMapFile, "Bam file path not absolute at line: " + line);
|
||||
}
|
||||
|
||||
final SAMReaderID bamID = new SAMReaderID(bamFile, new Tags());
|
||||
|
||||
if ( sampleRenameMap.containsKey(bamID) ) {
|
||||
throw new UserException.MalformedFile(sampleRenameMapFile,
|
||||
String.format("Bam file %s appears more than once", bamFile.getAbsolutePath()));
|
||||
}
|
||||
|
||||
sampleRenameMap.put(bamID, newSampleName);
|
||||
}
|
||||
}
|
||||
catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(sampleRenameMapFile, e);
|
||||
}
|
||||
|
||||
return sampleRenameMap;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Opens a reference sequence file paired with an index. Only public for testing purposes
|
||||
*
|
||||
|
|
|
|||
|
|
@ -281,6 +281,15 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Should we override the Walker's default and keep program records from the SAM header", required = false)
|
||||
public boolean keepProgramRecords = false;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file",
|
||||
doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file. This option requires that " +
|
||||
"each BAM file listed in the mapping file have only a single sample specified in its header (though there " +
|
||||
"may be multiple read groups for that sample). Each line of the mapping file must contain the absolute path " +
|
||||
"to a BAM file, followed by whitespace, followed by the new sample name for that BAM file.",
|
||||
required = false)
|
||||
public File sampleRenameMappingFile = null;
|
||||
|
||||
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
|
||||
public ValidationExclusion.TYPE unsafe;
|
||||
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ import net.sf.samtools.*;
|
|||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.Tags;
|
||||
import org.broadinstitute.sting.gatk.ReadMetrics;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
|
|
@ -47,8 +48,10 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.*;
|
||||
|
|
@ -131,6 +134,11 @@ public class SAMDataSource {
|
|||
*/
|
||||
private final Map<SAMReaderID,ReadGroupMapping> originalToMergedReadGroupMappings = new HashMap<SAMReaderID,ReadGroupMapping>();
|
||||
|
||||
/**
|
||||
* Mapping from bam file ID to new sample name. Used only when doing on-the-fly sample renaming.
|
||||
*/
|
||||
private Map<SAMReaderID, String> sampleRenameMap = null;
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
private static Logger logger = Logger.getLogger(SAMDataSource.class);
|
||||
|
||||
|
|
@ -202,7 +210,8 @@ public class SAMDataSource {
|
|||
includeReadsWithDeletionAtLoci,
|
||||
(byte) -1,
|
||||
false,
|
||||
false);
|
||||
false,
|
||||
null);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -219,6 +228,8 @@ public class SAMDataSource {
|
|||
* bases will be seen in the pileups, and the deletions will be skipped silently.
|
||||
* @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality.
|
||||
* @param keepReadsInLIBS should we keep a unique list of reads in LIBS?
|
||||
* @param sampleRenameMap Map of BAM file to new sample ID used during on-the-fly runtime sample renaming.
|
||||
* Will be null if we're not doing sample renaming.
|
||||
*/
|
||||
public SAMDataSource(
|
||||
Collection<SAMReaderID> samFiles,
|
||||
|
|
@ -235,7 +246,9 @@ public class SAMDataSource {
|
|||
boolean includeReadsWithDeletionAtLoci,
|
||||
byte defaultBaseQualities,
|
||||
boolean removeProgramRecords,
|
||||
final boolean keepReadsInLIBS) {
|
||||
final boolean keepReadsInLIBS,
|
||||
final Map<SAMReaderID, String> sampleRenameMap) {
|
||||
|
||||
this.readMetrics = new ReadMetrics();
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
|
||||
|
|
@ -261,6 +274,8 @@ public class SAMDataSource {
|
|||
ReadShard.setReadBufferSize(100000);
|
||||
}
|
||||
|
||||
this.sampleRenameMap = sampleRenameMap;
|
||||
|
||||
resourcePool = new SAMResourcePool(Integer.MAX_VALUE);
|
||||
SAMReaders readers = resourcePool.getAvailableReaders();
|
||||
|
||||
|
|
@ -825,8 +840,31 @@ public class SAMDataSource {
|
|||
if ( totalNumberOfFiles > 0 ) logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime()));
|
||||
|
||||
Collection<SAMFileHeader> headers = new LinkedList<SAMFileHeader>();
|
||||
for(SAMFileReader reader: readers.values())
|
||||
headers.add(reader.getFileHeader());
|
||||
|
||||
// Examine the bam headers, perform any requested sample renaming on them, and add
|
||||
// them to the list of headers to pass to the Picard SamFileHeaderMerger:
|
||||
for ( final Map.Entry<SAMReaderID, SAMFileReader> readerEntry : readers.entrySet() ) {
|
||||
final SAMReaderID readerID = readerEntry.getKey();
|
||||
final SAMFileReader reader = readerEntry.getValue();
|
||||
final SAMFileHeader header = reader.getFileHeader();
|
||||
|
||||
// The remappedSampleName will be null if either no on-the-fly sample renaming was requested,
|
||||
// or the user's sample rename map file didn't contain an entry for this bam file:
|
||||
final String remappedSampleName = sampleRenameMap != null ? sampleRenameMap.get(readerID) : null;
|
||||
|
||||
// If we've been asked to rename the sample for this bam file, do so now. We'll check to
|
||||
// make sure this bam only contains reads from one sample before proceeding.
|
||||
//
|
||||
// IMPORTANT: relies on the fact that the Picard SamFileHeaderMerger makes a copy of
|
||||
// the existing read group attributes (including sample name) when merging
|
||||
// headers, regardless of whether there are read group collisions or not.
|
||||
if ( remappedSampleName != null ) {
|
||||
remapSampleName(readerID, header, remappedSampleName);
|
||||
}
|
||||
|
||||
headers.add(header);
|
||||
}
|
||||
|
||||
headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,headers,true);
|
||||
|
||||
// update all read groups to GATKSAMRecordReadGroups
|
||||
|
|
@ -837,6 +875,43 @@ public class SAMDataSource {
|
|||
headerMerger.getMergedHeader().setReadGroups(gatkReadGroups);
|
||||
}
|
||||
|
||||
/**
|
||||
* Changes the sample name in the read groups for the provided bam file header to match the
|
||||
* remappedSampleName. Blows up with a UserException if the header contains more than one
|
||||
* sample name.
|
||||
*
|
||||
* @param readerID ID for the bam file from which the provided header came from
|
||||
* @param header The bam file header. Will be modified by this call.
|
||||
* @param remappedSampleName New sample name to replace the existing sample attribute in the
|
||||
* read groups for the header.
|
||||
*/
|
||||
private void remapSampleName( final SAMReaderID readerID, final SAMFileHeader header, final String remappedSampleName ) {
|
||||
String firstEncounteredSample = null;
|
||||
|
||||
for ( final SAMReadGroupRecord readGroup : header.getReadGroups() ) {
|
||||
final String thisReadGroupSample = readGroup.getSample();
|
||||
|
||||
if ( thisReadGroupSample == null ) {
|
||||
throw new UserException(String.format("On-the fly sample renaming was requested for bam file %s, however this " +
|
||||
"bam file contains a read group (id: %s) with a null sample attribute",
|
||||
readerID.getSamFilePath(), readGroup.getId()));
|
||||
}
|
||||
else if ( firstEncounteredSample == null ) {
|
||||
firstEncounteredSample = thisReadGroupSample;
|
||||
}
|
||||
else if ( ! firstEncounteredSample.equals(thisReadGroupSample) ) {
|
||||
throw new UserException(String.format("On-the-fly sample renaming was requested for bam file %s, " +
|
||||
"however this bam file contains reads from more than one sample " +
|
||||
"(encountered samples %s and %s in the bam header). The GATK requires that " +
|
||||
"all bams for which on-the-fly sample renaming is requested " +
|
||||
"contain reads from only a single sample per bam.",
|
||||
readerID.getSamFilePath(), firstEncounteredSample, thisReadGroupSample));
|
||||
}
|
||||
|
||||
readGroup.setSample(remappedSampleName);
|
||||
}
|
||||
}
|
||||
|
||||
final private void printReaderPerformance(final int nExecutedTotal,
|
||||
final int nExecutedInTick,
|
||||
final int totalNumberOfFiles,
|
||||
|
|
|
|||
|
|
@ -26,10 +26,13 @@
|
|||
package org.broadinstitute.sting.gatk;
|
||||
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -45,13 +48,12 @@ import org.broadinstitute.variant.vcf.VCFCodec;
|
|||
import org.broadinstitute.variant.vcf.VCFHeader;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLine;
|
||||
import org.testng.Assert;
|
||||
import org.testng.TestException;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.Arrays;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
*
|
||||
|
|
@ -278,6 +280,12 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
|
|||
executeTest("testDefaultBaseQualitiesNoneProvided", testDefaultBaseQualities(null, ""));
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test engine-level cigar consolidation
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
public void testGATKEngineConsolidatesCigars() {
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" +
|
||||
|
|
@ -297,4 +305,232 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
|
|||
// Original cigar was 0M3M0M8M. Check that it's been consolidated after running through the GATK engine:
|
||||
Assert.assertEquals(read.getCigarString(), "11M", "Cigar 0M3M0M8M not consolidated correctly by the engine");
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test on-the-fly sample renaming
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
// On-the-fly sample renaming test case: one single-sample bam with multiple read groups
|
||||
@Test
|
||||
public void testOnTheFlySampleRenamingWithSingleBamFile() throws IOException {
|
||||
final File sampleRenameMapFile = createTestSampleRenameMapFile(
|
||||
Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam myNewSampleName"));
|
||||
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" +
|
||||
" -R " + b37KGReference +
|
||||
" -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" +
|
||||
" --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() +
|
||||
" -o %s",
|
||||
1, Arrays.asList("")); // No MD5s; we only want to check the read groups
|
||||
|
||||
final File outputBam = executeTest("testOnTheFlySampleRenamingWithSingleBamFile", spec).first.get(0);
|
||||
final SAMFileReader reader = new SAMFileReader(outputBam);
|
||||
|
||||
for ( final SAMReadGroupRecord readGroup : reader.getFileHeader().getReadGroups() ) {
|
||||
Assert.assertEquals(readGroup.getSample(), "myNewSampleName", String.format("Sample for read group %s not renamed correctly", readGroup.getId()));
|
||||
}
|
||||
|
||||
reader.close();
|
||||
}
|
||||
|
||||
// On-the-fly sample renaming test case: three single-sample bams with multiple read groups per bam
|
||||
@Test
|
||||
public void testOnTheFlySampleRenamingWithMultipleBamFiles() throws IOException {
|
||||
final File sampleRenameMapFile = createTestSampleRenameMapFile(
|
||||
Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878",
|
||||
privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam newSampleFor12891",
|
||||
privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam newSampleFor12892"));
|
||||
|
||||
final Map<String, String> readGroupToNewSampleMap = new HashMap<>();
|
||||
for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) {
|
||||
final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID));
|
||||
final SAMFileReader inputBamReader = new SAMFileReader(inputBam);
|
||||
final String newSampleName = String.format("newSampleFor%s", inputBamID);
|
||||
for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) {
|
||||
readGroupToNewSampleMap.put(readGroup.getId(), newSampleName);
|
||||
}
|
||||
inputBamReader.close();
|
||||
}
|
||||
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" +
|
||||
" -R " + b37KGReference +
|
||||
" -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" +
|
||||
" -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam" +
|
||||
" -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam" +
|
||||
" --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() +
|
||||
" -o %s",
|
||||
1, Arrays.asList("")); // No MD5s; we only want to check the read groups
|
||||
|
||||
final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFiles", spec).first.get(0);
|
||||
final SAMFileReader outputBamReader = new SAMFileReader(outputBam);
|
||||
|
||||
int totalReadGroupsSeen = 0;
|
||||
for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) {
|
||||
Assert.assertEquals(readGroup.getSample(), readGroupToNewSampleMap.get(readGroup.getId()),
|
||||
String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId()));
|
||||
totalReadGroupsSeen++;
|
||||
}
|
||||
|
||||
Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file");
|
||||
|
||||
outputBamReader.close();
|
||||
}
|
||||
|
||||
// On-the-fly sample renaming test case: three single-sample bams with multiple read groups per bam,
|
||||
// performing renaming in only SOME of the bams
|
||||
@Test
|
||||
public void testOnTheFlySampleRenamingWithMultipleBamFilesPartialRename() throws IOException {
|
||||
// Rename samples for NA12878 and NA12892, but not for NA12891
|
||||
final File sampleRenameMapFile = createTestSampleRenameMapFile(
|
||||
Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878",
|
||||
privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam newSampleFor12892"));
|
||||
|
||||
final Map<String, String> readGroupToNewSampleMap = new HashMap<>();
|
||||
for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) {
|
||||
final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID));
|
||||
final SAMFileReader inputBamReader = new SAMFileReader(inputBam);
|
||||
|
||||
// Special-case NA12891, which we're not renaming:
|
||||
final String newSampleName = inputBamID.equals("12891") ? "NA12891" : String.format("newSampleFor%s", inputBamID);
|
||||
|
||||
for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) {
|
||||
readGroupToNewSampleMap.put(readGroup.getId(), newSampleName);
|
||||
}
|
||||
inputBamReader.close();
|
||||
}
|
||||
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" +
|
||||
" -R " + b37KGReference +
|
||||
" -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" +
|
||||
" -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam" +
|
||||
" -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam" +
|
||||
" --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() +
|
||||
" -o %s",
|
||||
1, Arrays.asList("")); // No MD5s; we only want to check the read groups
|
||||
|
||||
final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFilesPartialRename", spec).first.get(0);
|
||||
final SAMFileReader outputBamReader = new SAMFileReader(outputBam);
|
||||
|
||||
int totalReadGroupsSeen = 0;
|
||||
for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) {
|
||||
Assert.assertEquals(readGroup.getSample(), readGroupToNewSampleMap.get(readGroup.getId()),
|
||||
String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId()));
|
||||
totalReadGroupsSeen++;
|
||||
}
|
||||
|
||||
Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file");
|
||||
|
||||
outputBamReader.close();
|
||||
}
|
||||
|
||||
// On-the-fly sample renaming test case: two single-sample bams with read group collisions
|
||||
@Test
|
||||
public void testOnTheFlySampleRenamingWithReadGroupCollisions() throws IOException {
|
||||
final File sampleRenameMapFile = createTestSampleRenameMapFile(
|
||||
Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878",
|
||||
privateTestDir + "CEUTrio.HiSeq.WGS.b37.READ_GROUP_COLLISIONS_WITH_NA12878.HEADERONLY.bam newSampleForNot12878"));
|
||||
|
||||
final Set<String> na12878ReadGroups = new HashSet<>();
|
||||
final SAMFileReader inputBamReader = new SAMFileReader(new File(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam"));
|
||||
for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) {
|
||||
na12878ReadGroups.add(readGroup.getId());
|
||||
}
|
||||
inputBamReader.close();
|
||||
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" +
|
||||
" -R " + b37KGReference +
|
||||
" -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" +
|
||||
" -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.READ_GROUP_COLLISIONS_WITH_NA12878.HEADERONLY.bam" +
|
||||
" --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() +
|
||||
" -o %s",
|
||||
1, Arrays.asList("")); // No MD5s; we only want to check the read groups
|
||||
|
||||
final File outputBam = executeTest("testOnTheFlySampleRenamingWithReadGroupCollisions", spec).first.get(0);
|
||||
final SAMFileReader outputBamReader = new SAMFileReader(outputBam);
|
||||
|
||||
int totalReadGroupsSeen = 0;
|
||||
for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) {
|
||||
String expectedSampleName = "";
|
||||
if ( na12878ReadGroups.contains(readGroup.getId()) ) {
|
||||
expectedSampleName = "newSampleFor12878";
|
||||
}
|
||||
else {
|
||||
expectedSampleName = "newSampleForNot12878";
|
||||
}
|
||||
|
||||
Assert.assertEquals(readGroup.getSample(), expectedSampleName,
|
||||
String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId()));
|
||||
totalReadGroupsSeen++;
|
||||
}
|
||||
|
||||
Assert.assertEquals(totalReadGroupsSeen, na12878ReadGroups.size() * 2, "Wrong number of read groups encountered in output bam file");
|
||||
|
||||
outputBamReader.close();
|
||||
}
|
||||
|
||||
// On-the-fly sample renaming test case: a multi-sample bam (this should generate a UserException)
|
||||
@Test
|
||||
public void testOnTheFlySampleRenamingWithMultiSampleBam() throws IOException {
|
||||
final File sampleRenameMapFile = createTestSampleRenameMapFile(
|
||||
Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.MERGED.HEADERONLY.bam myNewSampleName"));
|
||||
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" +
|
||||
" -R " + b37KGReference +
|
||||
" -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.MERGED.HEADERONLY.bam" +
|
||||
" --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() +
|
||||
" -o %s",
|
||||
1,
|
||||
UserException.class); // expecting a UserException here
|
||||
|
||||
executeTest("testOnTheFlySampleRenamingWithMultiSampleBam", spec);
|
||||
}
|
||||
|
||||
// On-the-fly sample renaming test case: ensure that walkers can see the remapped sample names in individual reads
|
||||
@Test
|
||||
public void testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInReads() throws IOException {
|
||||
final File sampleRenameMapFile = createTestSampleRenameMapFile(
|
||||
Arrays.asList(privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam myNewSampleName"));
|
||||
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(" -T OnTheFlySampleRenamingVerifyingTestWalker" +
|
||||
" -R " + b37KGReference +
|
||||
" -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam" +
|
||||
" --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() +
|
||||
" --newSampleName myNewSampleName" +
|
||||
" -L 20:10000000-10001000",
|
||||
1, Arrays.asList(""));
|
||||
|
||||
// Test is a success if our custom walker doesn't throw an exception
|
||||
executeTest("testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInReads", spec);
|
||||
}
|
||||
|
||||
private File createTestSampleRenameMapFile( final List<String> contents ) throws IOException {
|
||||
final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp");
|
||||
final PrintWriter writer = new PrintWriter(mapFile);
|
||||
|
||||
for ( final String line : contents ) {
|
||||
writer.println(line);
|
||||
}
|
||||
writer.close();
|
||||
|
||||
return mapFile;
|
||||
}
|
||||
|
||||
public static class OnTheFlySampleRenamingVerifyingTestWalker extends ReadWalker<Integer, Integer> {
|
||||
@Argument(fullName = "newSampleName", shortName = "newSampleName", doc = "", required = true)
|
||||
String newSampleName = null;
|
||||
|
||||
public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
|
||||
if ( ! newSampleName.equals(read.getReadGroup().getSample()) ) {
|
||||
throw new IllegalStateException(String.format("Encountered read with the wrong sample name. Expected %s found %s",
|
||||
newSampleName, read.getReadGroup().getSample()));
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
public Integer reduceInit() { return 0; }
|
||||
public Integer reduce(Integer value, Integer sum) { return value + sum; }
|
||||
}
|
||||
}
|
||||
|
|
@ -42,10 +42,9 @@ import org.testng.annotations.DataProvider;
|
|||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Tests selected functionality in the GenomeAnalysisEngine class
|
||||
|
|
@ -104,6 +103,64 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest {
|
|||
testEngine.validateSuppliedIntervals();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLoadWellFormedSampleRenameMapFile() throws IOException {
|
||||
final File mapFile = createTestSampleRenameMapFile(Arrays.asList("/foo/bar/first.bam newSample1",
|
||||
"/foo/bar/second.bam newSample2",
|
||||
"/foo/bar2/third.bam newSample3"));
|
||||
final GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
|
||||
final Map<SAMReaderID, String> renameMap = engine.loadSampleRenameMap(mapFile);
|
||||
|
||||
Assert.assertEquals(renameMap.size(), 3, "Sample rename map was wrong size after loading from file");
|
||||
|
||||
final Iterator<String> expectedResultsIterator = Arrays.asList("/foo/bar/first.bam", "newSample1", "/foo/bar/second.bam", "newSample2", "/foo/bar2/third.bam", "newSample3").iterator();
|
||||
while ( expectedResultsIterator.hasNext() ) {
|
||||
final String expectedKey = expectedResultsIterator.next();
|
||||
final String expectedValue = expectedResultsIterator.next();
|
||||
|
||||
Assert.assertNotNull(renameMap.get(new SAMReaderID(expectedKey, new Tags())), String.format("Entry for %s not found in sample rename map", expectedKey));
|
||||
Assert.assertEquals(renameMap.get(new SAMReaderID(expectedKey, new Tags())), expectedValue, "Wrong value in sample rename map for " + expectedKey);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "MalformedSampleRenameMapFileDataProvider")
|
||||
public Object[][] generateMalformedSampleRenameMapFiles() throws IOException {
|
||||
final List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
tests.add(new Object[]{"testLoadSampleRenameMapFileNonExistentFile",
|
||||
new File("/foo/bar/nonexistent")});
|
||||
tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine1",
|
||||
createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam"))});
|
||||
tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine2",
|
||||
createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam newSample extraField"))});
|
||||
tests.add(new Object[]{"testLoadSampleRenameMapFileNonAbsoluteBamPath",
|
||||
createTestSampleRenameMapFile(Arrays.asList("relative/path/to/foo.bam newSample"))});
|
||||
tests.add(new Object[]{"testLoadSampleRenameMapFileDuplicateBamPath",
|
||||
createTestSampleRenameMapFile(Arrays.asList("/path/to/dupe.bam newSample1",
|
||||
"/path/to/dupe.bam newSample2"))});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "MalformedSampleRenameMapFileDataProvider", expectedExceptions = UserException.class)
|
||||
public void testLoadMalformedSampleRenameMapFile( final String testName, final File mapFile ) {
|
||||
logger.info("Executing test " + testName);
|
||||
|
||||
final GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
|
||||
final Map<SAMReaderID, String> renameMap = engine.loadSampleRenameMap(mapFile);
|
||||
}
|
||||
|
||||
private File createTestSampleRenameMapFile( final List<String> contents ) throws IOException {
|
||||
final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp");
|
||||
final PrintWriter writer = new PrintWriter(mapFile);
|
||||
|
||||
for ( final String line : contents ) {
|
||||
writer.println(line);
|
||||
}
|
||||
writer.close();
|
||||
|
||||
return mapFile;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
// Test the ReadTransformer ordering enforcement //
|
||||
|
|
|
|||
|
|
@ -158,7 +158,7 @@ public class ReadMetricsUnitTest extends BaseTest {
|
|||
new ValidationExclusion(),
|
||||
new ArrayList<ReadFilter>(),
|
||||
new ArrayList<ReadTransformer>(),
|
||||
false, (byte)30, false, true);
|
||||
false, (byte)30, false, true, null);
|
||||
|
||||
engine.setReadsDataSource(dataSource);
|
||||
|
||||
|
|
@ -193,7 +193,7 @@ public class ReadMetricsUnitTest extends BaseTest {
|
|||
new ValidationExclusion(),
|
||||
new ArrayList<ReadFilter>(),
|
||||
new ArrayList<ReadTransformer>(),
|
||||
false, (byte)30, false, true);
|
||||
false, (byte)30, false, true, null);
|
||||
|
||||
engine.setReadsDataSource(dataSource);
|
||||
final Set<String> samples = SampleUtils.getSAMFileSamples(dataSource.getHeader());
|
||||
|
|
@ -234,7 +234,7 @@ public class ReadMetricsUnitTest extends BaseTest {
|
|||
new ValidationExclusion(),
|
||||
new ArrayList<ReadFilter>(),
|
||||
new ArrayList<ReadTransformer>(),
|
||||
false, (byte)30, false, true);
|
||||
false, (byte)30, false, true, null);
|
||||
|
||||
engine.setReadsDataSource(dataSource);
|
||||
final Set<String> samples = SampleUtils.getSAMFileSamples(dataSource.getHeader());
|
||||
|
|
@ -281,7 +281,7 @@ public class ReadMetricsUnitTest extends BaseTest {
|
|||
new ValidationExclusion(),
|
||||
filters,
|
||||
new ArrayList<ReadTransformer>(),
|
||||
false, (byte)30, false, true);
|
||||
false, (byte)30, false, true, null);
|
||||
|
||||
engine.setReadsDataSource(dataSource);
|
||||
|
||||
|
|
|
|||
|
|
@ -183,7 +183,8 @@ public class SAMDataSourceUnitTest extends BaseTest {
|
|||
false,
|
||||
(byte) -1,
|
||||
removeProgramRecords,
|
||||
false);
|
||||
false,
|
||||
null);
|
||||
|
||||
List<SAMProgramRecord> dontRemoveProgramRecords = data.getHeader().getProgramRecords();
|
||||
assertEquals(dontRemoveProgramRecords, defaultProgramRecords, "testRemoveProgramRecords: default program records differ from removeProgramRecords = false");
|
||||
|
|
@ -203,7 +204,8 @@ public class SAMDataSourceUnitTest extends BaseTest {
|
|||
false,
|
||||
(byte) -1,
|
||||
removeProgramRecords,
|
||||
false);
|
||||
false,
|
||||
null);
|
||||
|
||||
List<SAMProgramRecord> doRemoveProgramRecords = data.getHeader().getProgramRecords();
|
||||
assertTrue(doRemoveProgramRecords.isEmpty(), "testRemoveProgramRecords: program records not cleared when removeProgramRecords = true");
|
||||
|
|
|
|||
|
|
@ -481,7 +481,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
|
|||
new ValidationExclusion(),
|
||||
new ArrayList<ReadFilter>(),
|
||||
new ArrayList<ReadTransformer>(),
|
||||
false, (byte)30, false, true);
|
||||
false, (byte)30, false, true, null);
|
||||
|
||||
engine.setReadsDataSource(dataSource);
|
||||
final Set<String> samples = SampleUtils.getSAMFileSamples(dataSource.getHeader());
|
||||
|
|
|
|||
Loading…
Reference in New Issue