From 605a5ac2e329055c974ad3ee6517ed5c68614379 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 3 Jul 2013 12:41:01 -0400 Subject: [PATCH] GATK engine: add ability to do on-the-fly BAM file sample renaming at runtime -User must provide a mapping file via new --sample_rename_mapping_file argument. Mapping file must contain a mapping from absolute bam file path to new sample name (format is described in the docs for the argument). -Requires that each bam file listed in the mapping file contain only one sample in their headers (they may contain multiple read groups for that sample, however). The engine enforces this, and throws a UserException if on-the-fly renaming is requested for a multi-sample bam. -Not all bam files for a traversal need to be listed in the mapping file. -On-the-fly renaming is done as the VERY first step after creating the SAMFileReaders in SAMDataSource (before the headers are even merged), to prevent possible consistency issues. -Renaming is done ONCE at traversal start for each SAMReaders resource creation in the SAMResourcePool; this effectively means once per -nt thread -Comprehensive unit/integration tests Known issues: -if you specify the absolute path to a bam in the mapping file, and then provide a path to that same bam to -I using SYMLINKS, the renaming won't work. The absolute paths will look different to the engine due to the symlink being present in one path and not in the other path. GSA-974 #resolve --- .../sting/gatk/GenomeAnalysisEngine.java | 62 ++++- .../arguments/GATKArgumentCollection.java | 9 + .../gatk/datasources/reads/SAMDataSource.java | 83 +++++- .../gatk/EngineFeaturesIntegrationTest.java | 244 +++++++++++++++++- .../gatk/GenomeAnalysisEngineUnitTest.java | 65 ++++- .../sting/gatk/ReadMetricsUnitTest.java | 8 +- .../reads/SAMDataSourceUnitTest.java | 6 +- .../TraverseActiveRegionsUnitTest.java | 2 +- 8 files changed, 459 insertions(+), 20 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index f01e8ad62..27b030060 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -62,9 +62,11 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; import org.broadinstitute.sting.utils.recalibration.BQSRArgumentSet; +import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; import java.io.File; +import java.io.FileNotFoundException; import java.util.*; import java.util.concurrent.TimeUnit; @@ -854,6 +856,10 @@ public class GenomeAnalysisEngine { final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker; + final Map sampleRenameMap = argCollection.sampleRenameMappingFile != null ? + loadSampleRenameMap(argCollection.sampleRenameMappingFile) : + null; + return new SAMDataSource( samReaderIDs, threadAllocation, @@ -869,9 +875,63 @@ public class GenomeAnalysisEngine { includeReadsWithDeletionAtLoci(), argCollection.defaultBaseQualities, removeProgramRecords, - keepReadsInLIBS); + keepReadsInLIBS, + sampleRenameMap); } + /** + * Loads a user-provided sample rename map file for use in on-the-fly sample renaming into an in-memory + * HashMap. This file must consist of lines with two whitespace-separated fields: + * + * absolute_path_to_bam_file new_sample_name + * + * The engine will verify that each bam file contains reads from only one sample when the on-the-fly sample + * renaming feature is being used. + * + * @param sampleRenameMapFile sample rename map file from which to load data + * @return a HashMap containing the contents of the map file, with the keys being the bam file paths and + * the values being the new sample names. + */ + protected Map loadSampleRenameMap( final File sampleRenameMapFile ) { + logger.info("Renaming samples from BAM files on-the-fly using mapping file " + sampleRenameMapFile.getAbsolutePath()); + + final Map sampleRenameMap = new HashMap<>((int)sampleRenameMapFile.length() / 50); + + try { + for ( final String line : new XReadLines(sampleRenameMapFile) ) { + final String[] tokens = line.split("\\s+"); + + if ( tokens.length != 2 ) { + throw new UserException.MalformedFile(sampleRenameMapFile, + String.format("Encountered a line with %s fields instead of the required 2 fields. Line was: %s", + tokens.length, line)); + } + + final File bamFile = new File(tokens[0]); + final String newSampleName = tokens[1]; + + if ( ! bamFile.isAbsolute() ) { + throw new UserException.MalformedFile(sampleRenameMapFile, "Bam file path not absolute at line: " + line); + } + + final SAMReaderID bamID = new SAMReaderID(bamFile, new Tags()); + + if ( sampleRenameMap.containsKey(bamID) ) { + throw new UserException.MalformedFile(sampleRenameMapFile, + String.format("Bam file %s appears more than once", bamFile.getAbsolutePath())); + } + + sampleRenameMap.put(bamID, newSampleName); + } + } + catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(sampleRenameMapFile, e); + } + + return sampleRenameMap; + } + + /** * Opens a reference sequence file paired with an index. Only public for testing purposes * diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index b38f0fc0b..509b875bb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -281,6 +281,15 @@ public class GATKArgumentCollection { @Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Should we override the Walker's default and keep program records from the SAM header", required = false) public boolean keepProgramRecords = false; + @Advanced + @Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file", + doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file. This option requires that " + + "each BAM file listed in the mapping file have only a single sample specified in its header (though there " + + "may be multiple read groups for that sample). Each line of the mapping file must contain the absolute path " + + "to a BAM file, followed by whitespace, followed by the new sample name for that BAM file.", + required = false) + public File sampleRenameMappingFile = null; + @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) public ValidationExclusion.TYPE unsafe; diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index a36667ec4..ac2ed4a4c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -31,6 +31,7 @@ import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; import net.sf.samtools.util.RuntimeIOException; import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; @@ -47,8 +48,10 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; +import org.broadinstitute.sting.utils.text.XReadLines; import java.io.File; +import java.io.FileNotFoundException; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.*; @@ -131,6 +134,11 @@ public class SAMDataSource { */ private final Map originalToMergedReadGroupMappings = new HashMap(); + /** + * Mapping from bam file ID to new sample name. Used only when doing on-the-fly sample renaming. + */ + private Map sampleRenameMap = null; + /** our log, which we want to capture anything from this class */ private static Logger logger = Logger.getLogger(SAMDataSource.class); @@ -202,7 +210,8 @@ public class SAMDataSource { includeReadsWithDeletionAtLoci, (byte) -1, false, - false); + false, + null); } /** @@ -219,6 +228,8 @@ public class SAMDataSource { * bases will be seen in the pileups, and the deletions will be skipped silently. * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. * @param keepReadsInLIBS should we keep a unique list of reads in LIBS? + * @param sampleRenameMap Map of BAM file to new sample ID used during on-the-fly runtime sample renaming. + * Will be null if we're not doing sample renaming. */ public SAMDataSource( Collection samFiles, @@ -235,7 +246,9 @@ public class SAMDataSource { boolean includeReadsWithDeletionAtLoci, byte defaultBaseQualities, boolean removeProgramRecords, - final boolean keepReadsInLIBS) { + final boolean keepReadsInLIBS, + final Map sampleRenameMap) { + this.readMetrics = new ReadMetrics(); this.genomeLocParser = genomeLocParser; @@ -261,6 +274,8 @@ public class SAMDataSource { ReadShard.setReadBufferSize(100000); } + this.sampleRenameMap = sampleRenameMap; + resourcePool = new SAMResourcePool(Integer.MAX_VALUE); SAMReaders readers = resourcePool.getAvailableReaders(); @@ -825,8 +840,31 @@ public class SAMDataSource { if ( totalNumberOfFiles > 0 ) logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime())); Collection headers = new LinkedList(); - for(SAMFileReader reader: readers.values()) - headers.add(reader.getFileHeader()); + + // Examine the bam headers, perform any requested sample renaming on them, and add + // them to the list of headers to pass to the Picard SamFileHeaderMerger: + for ( final Map.Entry readerEntry : readers.entrySet() ) { + final SAMReaderID readerID = readerEntry.getKey(); + final SAMFileReader reader = readerEntry.getValue(); + final SAMFileHeader header = reader.getFileHeader(); + + // The remappedSampleName will be null if either no on-the-fly sample renaming was requested, + // or the user's sample rename map file didn't contain an entry for this bam file: + final String remappedSampleName = sampleRenameMap != null ? sampleRenameMap.get(readerID) : null; + + // If we've been asked to rename the sample for this bam file, do so now. We'll check to + // make sure this bam only contains reads from one sample before proceeding. + // + // IMPORTANT: relies on the fact that the Picard SamFileHeaderMerger makes a copy of + // the existing read group attributes (including sample name) when merging + // headers, regardless of whether there are read group collisions or not. + if ( remappedSampleName != null ) { + remapSampleName(readerID, header, remappedSampleName); + } + + headers.add(header); + } + headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,headers,true); // update all read groups to GATKSAMRecordReadGroups @@ -837,6 +875,43 @@ public class SAMDataSource { headerMerger.getMergedHeader().setReadGroups(gatkReadGroups); } + /** + * Changes the sample name in the read groups for the provided bam file header to match the + * remappedSampleName. Blows up with a UserException if the header contains more than one + * sample name. + * + * @param readerID ID for the bam file from which the provided header came from + * @param header The bam file header. Will be modified by this call. + * @param remappedSampleName New sample name to replace the existing sample attribute in the + * read groups for the header. + */ + private void remapSampleName( final SAMReaderID readerID, final SAMFileHeader header, final String remappedSampleName ) { + String firstEncounteredSample = null; + + for ( final SAMReadGroupRecord readGroup : header.getReadGroups() ) { + final String thisReadGroupSample = readGroup.getSample(); + + if ( thisReadGroupSample == null ) { + throw new UserException(String.format("On-the fly sample renaming was requested for bam file %s, however this " + + "bam file contains a read group (id: %s) with a null sample attribute", + readerID.getSamFilePath(), readGroup.getId())); + } + else if ( firstEncounteredSample == null ) { + firstEncounteredSample = thisReadGroupSample; + } + else if ( ! firstEncounteredSample.equals(thisReadGroupSample) ) { + throw new UserException(String.format("On-the-fly sample renaming was requested for bam file %s, " + + "however this bam file contains reads from more than one sample " + + "(encountered samples %s and %s in the bam header). The GATK requires that " + + "all bams for which on-the-fly sample renaming is requested " + + "contain reads from only a single sample per bam.", + readerID.getSamFilePath(), firstEncounteredSample, thisReadGroupSample)); + } + + readGroup.setSample(remappedSampleName); + } + } + final private void printReaderPerformance(final int nExecutedTotal, final int nExecutedInTick, final int totalNumberOfFiles, diff --git a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java index aca6cf984..c65d62149 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java @@ -26,10 +26,13 @@ package org.broadinstitute.sting.gatk; import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import org.broad.tribble.readers.AsciiLineReader; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -45,13 +48,12 @@ import org.broadinstitute.variant.vcf.VCFCodec; import org.broadinstitute.variant.vcf.VCFHeader; import org.broadinstitute.variant.vcf.VCFHeaderLine; import org.testng.Assert; +import org.testng.TestException; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.File; -import java.io.FileInputStream; -import java.io.PrintStream; -import java.util.Arrays; +import java.io.*; +import java.util.*; /** * @@ -278,6 +280,12 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { executeTest("testDefaultBaseQualitiesNoneProvided", testDefaultBaseQualities(null, "")); } + // -------------------------------------------------------------------------------- + // + // Test engine-level cigar consolidation + // + // -------------------------------------------------------------------------------- + @Test public void testGATKEngineConsolidatesCigars() { final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + @@ -297,4 +305,232 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { // Original cigar was 0M3M0M8M. Check that it's been consolidated after running through the GATK engine: Assert.assertEquals(read.getCigarString(), "11M", "Cigar 0M3M0M8M not consolidated correctly by the engine"); } + + // -------------------------------------------------------------------------------- + // + // Test on-the-fly sample renaming + // + // -------------------------------------------------------------------------------- + + // On-the-fly sample renaming test case: one single-sample bam with multiple read groups + @Test + public void testOnTheFlySampleRenamingWithSingleBamFile() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam myNewSampleName")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithSingleBamFile", spec).first.get(0); + final SAMFileReader reader = new SAMFileReader(outputBam); + + for ( final SAMReadGroupRecord readGroup : reader.getFileHeader().getReadGroups() ) { + Assert.assertEquals(readGroup.getSample(), "myNewSampleName", String.format("Sample for read group %s not renamed correctly", readGroup.getId())); + } + + reader.close(); + } + + // On-the-fly sample renaming test case: three single-sample bams with multiple read groups per bam + @Test + public void testOnTheFlySampleRenamingWithMultipleBamFiles() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878", + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam newSampleFor12891", + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam newSampleFor12892")); + + final Map readGroupToNewSampleMap = new HashMap<>(); + for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) { + final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID)); + final SAMFileReader inputBamReader = new SAMFileReader(inputBam); + final String newSampleName = String.format("newSampleFor%s", inputBamID); + for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { + readGroupToNewSampleMap.put(readGroup.getId(), newSampleName); + } + inputBamReader.close(); + } + + final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFiles", spec).first.get(0); + final SAMFileReader outputBamReader = new SAMFileReader(outputBam); + + int totalReadGroupsSeen = 0; + for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { + Assert.assertEquals(readGroup.getSample(), readGroupToNewSampleMap.get(readGroup.getId()), + String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); + totalReadGroupsSeen++; + } + + Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file"); + + outputBamReader.close(); + } + + // On-the-fly sample renaming test case: three single-sample bams with multiple read groups per bam, + // performing renaming in only SOME of the bams + @Test + public void testOnTheFlySampleRenamingWithMultipleBamFilesPartialRename() throws IOException { + // Rename samples for NA12878 and NA12892, but not for NA12891 + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878", + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam newSampleFor12892")); + + final Map readGroupToNewSampleMap = new HashMap<>(); + for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) { + final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID)); + final SAMFileReader inputBamReader = new SAMFileReader(inputBam); + + // Special-case NA12891, which we're not renaming: + final String newSampleName = inputBamID.equals("12891") ? "NA12891" : String.format("newSampleFor%s", inputBamID); + + for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { + readGroupToNewSampleMap.put(readGroup.getId(), newSampleName); + } + inputBamReader.close(); + } + + final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFilesPartialRename", spec).first.get(0); + final SAMFileReader outputBamReader = new SAMFileReader(outputBam); + + int totalReadGroupsSeen = 0; + for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { + Assert.assertEquals(readGroup.getSample(), readGroupToNewSampleMap.get(readGroup.getId()), + String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); + totalReadGroupsSeen++; + } + + Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file"); + + outputBamReader.close(); + } + + // On-the-fly sample renaming test case: two single-sample bams with read group collisions + @Test + public void testOnTheFlySampleRenamingWithReadGroupCollisions() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878", + privateTestDir + "CEUTrio.HiSeq.WGS.b37.READ_GROUP_COLLISIONS_WITH_NA12878.HEADERONLY.bam newSampleForNot12878")); + + final Set na12878ReadGroups = new HashSet<>(); + final SAMFileReader inputBamReader = new SAMFileReader(new File(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam")); + for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { + na12878ReadGroups.add(readGroup.getId()); + } + inputBamReader.close(); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.READ_GROUP_COLLISIONS_WITH_NA12878.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithReadGroupCollisions", spec).first.get(0); + final SAMFileReader outputBamReader = new SAMFileReader(outputBam); + + int totalReadGroupsSeen = 0; + for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { + String expectedSampleName = ""; + if ( na12878ReadGroups.contains(readGroup.getId()) ) { + expectedSampleName = "newSampleFor12878"; + } + else { + expectedSampleName = "newSampleForNot12878"; + } + + Assert.assertEquals(readGroup.getSample(), expectedSampleName, + String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); + totalReadGroupsSeen++; + } + + Assert.assertEquals(totalReadGroupsSeen, na12878ReadGroups.size() * 2, "Wrong number of read groups encountered in output bam file"); + + outputBamReader.close(); + } + + // On-the-fly sample renaming test case: a multi-sample bam (this should generate a UserException) + @Test + public void testOnTheFlySampleRenamingWithMultiSampleBam() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.MERGED.HEADERONLY.bam myNewSampleName")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.MERGED.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, + UserException.class); // expecting a UserException here + + executeTest("testOnTheFlySampleRenamingWithMultiSampleBam", spec); + } + + // On-the-fly sample renaming test case: ensure that walkers can see the remapped sample names in individual reads + @Test + public void testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInReads() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam myNewSampleName")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T OnTheFlySampleRenamingVerifyingTestWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " --newSampleName myNewSampleName" + + " -L 20:10000000-10001000", + 1, Arrays.asList("")); + + // Test is a success if our custom walker doesn't throw an exception + executeTest("testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInReads", spec); + } + + private File createTestSampleRenameMapFile( final List contents ) throws IOException { + final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp"); + final PrintWriter writer = new PrintWriter(mapFile); + + for ( final String line : contents ) { + writer.println(line); + } + writer.close(); + + return mapFile; + } + + public static class OnTheFlySampleRenamingVerifyingTestWalker extends ReadWalker { + @Argument(fullName = "newSampleName", shortName = "newSampleName", doc = "", required = true) + String newSampleName = null; + + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { + if ( ! newSampleName.equals(read.getReadGroup().getSample()) ) { + throw new IllegalStateException(String.format("Encountered read with the wrong sample name. Expected %s found %s", + newSampleName, read.getReadGroup().getSample())); + } + + return 1; + } + + public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { return value + sum; } + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java index c3bc78551..84bc6e080 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -42,10 +42,9 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.List; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.*; /** * Tests selected functionality in the GenomeAnalysisEngine class @@ -104,6 +103,64 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { testEngine.validateSuppliedIntervals(); } + @Test + public void testLoadWellFormedSampleRenameMapFile() throws IOException { + final File mapFile = createTestSampleRenameMapFile(Arrays.asList("/foo/bar/first.bam newSample1", + "/foo/bar/second.bam newSample2", + "/foo/bar2/third.bam newSample3")); + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + final Map renameMap = engine.loadSampleRenameMap(mapFile); + + Assert.assertEquals(renameMap.size(), 3, "Sample rename map was wrong size after loading from file"); + + final Iterator expectedResultsIterator = Arrays.asList("/foo/bar/first.bam", "newSample1", "/foo/bar/second.bam", "newSample2", "/foo/bar2/third.bam", "newSample3").iterator(); + while ( expectedResultsIterator.hasNext() ) { + final String expectedKey = expectedResultsIterator.next(); + final String expectedValue = expectedResultsIterator.next(); + + Assert.assertNotNull(renameMap.get(new SAMReaderID(expectedKey, new Tags())), String.format("Entry for %s not found in sample rename map", expectedKey)); + Assert.assertEquals(renameMap.get(new SAMReaderID(expectedKey, new Tags())), expectedValue, "Wrong value in sample rename map for " + expectedKey); + } + } + + @DataProvider(name = "MalformedSampleRenameMapFileDataProvider") + public Object[][] generateMalformedSampleRenameMapFiles() throws IOException { + final List tests = new ArrayList(); + + tests.add(new Object[]{"testLoadSampleRenameMapFileNonExistentFile", + new File("/foo/bar/nonexistent")}); + tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine1", + createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine2", + createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam newSample extraField"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileNonAbsoluteBamPath", + createTestSampleRenameMapFile(Arrays.asList("relative/path/to/foo.bam newSample"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileDuplicateBamPath", + createTestSampleRenameMapFile(Arrays.asList("/path/to/dupe.bam newSample1", + "/path/to/dupe.bam newSample2"))}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MalformedSampleRenameMapFileDataProvider", expectedExceptions = UserException.class) + public void testLoadMalformedSampleRenameMapFile( final String testName, final File mapFile ) { + logger.info("Executing test " + testName); + + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + final Map renameMap = engine.loadSampleRenameMap(mapFile); + } + + private File createTestSampleRenameMapFile( final List contents ) throws IOException { + final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp"); + final PrintWriter writer = new PrintWriter(mapFile); + + for ( final String line : contents ) { + writer.println(line); + } + writer.close(); + + return mapFile; + } /////////////////////////////////////////////////// // Test the ReadTransformer ordering enforcement // diff --git a/public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java index 56725147e..02d0c66b9 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java @@ -158,7 +158,7 @@ public class ReadMetricsUnitTest extends BaseTest { new ValidationExclusion(), new ArrayList(), new ArrayList(), - false, (byte)30, false, true); + false, (byte)30, false, true, null); engine.setReadsDataSource(dataSource); @@ -193,7 +193,7 @@ public class ReadMetricsUnitTest extends BaseTest { new ValidationExclusion(), new ArrayList(), new ArrayList(), - false, (byte)30, false, true); + false, (byte)30, false, true, null); engine.setReadsDataSource(dataSource); final Set samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); @@ -234,7 +234,7 @@ public class ReadMetricsUnitTest extends BaseTest { new ValidationExclusion(), new ArrayList(), new ArrayList(), - false, (byte)30, false, true); + false, (byte)30, false, true, null); engine.setReadsDataSource(dataSource); final Set samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); @@ -281,7 +281,7 @@ public class ReadMetricsUnitTest extends BaseTest { new ValidationExclusion(), filters, new ArrayList(), - false, (byte)30, false, true); + false, (byte)30, false, true, null); engine.setReadsDataSource(dataSource); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java index 8d33aa8b6..52285fb2e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -183,7 +183,8 @@ public class SAMDataSourceUnitTest extends BaseTest { false, (byte) -1, removeProgramRecords, - false); + false, + null); List dontRemoveProgramRecords = data.getHeader().getProgramRecords(); assertEquals(dontRemoveProgramRecords, defaultProgramRecords, "testRemoveProgramRecords: default program records differ from removeProgramRecords = false"); @@ -203,7 +204,8 @@ public class SAMDataSourceUnitTest extends BaseTest { false, (byte) -1, removeProgramRecords, - false); + false, + null); List doRemoveProgramRecords = data.getHeader().getProgramRecords(); assertTrue(doRemoveProgramRecords.isEmpty(), "testRemoveProgramRecords: program records not cleared when removeProgramRecords = true"); diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index e4b6c37cc..30c0c83b5 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -481,7 +481,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { new ValidationExclusion(), new ArrayList(), new ArrayList(), - false, (byte)30, false, true); + false, (byte)30, false, true, null); engine.setReadsDataSource(dataSource); final Set samples = SampleUtils.getSAMFileSamples(dataSource.getHeader());