Merge pull request #337 from broadinstitute/dr_runtime_sample_renaming_GSA-974

GATK engine: add ability to do on-the-fly BAM file sample renaming at runtime
This commit is contained in:
droazen 2013-07-18 12:51:02 -07:00
commit b992dcd9c2
8 changed files with 459 additions and 20 deletions

View File

@ -62,9 +62,11 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.progressmeter.ProgressMeter;
import org.broadinstitute.sting.utils.recalibration.BQSRArgumentSet;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.concurrent.TimeUnit;
@ -854,6 +856,10 @@ public class GenomeAnalysisEngine {
final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker;
final Map<SAMReaderID, String> sampleRenameMap = argCollection.sampleRenameMappingFile != null ?
loadSampleRenameMap(argCollection.sampleRenameMappingFile) :
null;
return new SAMDataSource(
samReaderIDs,
threadAllocation,
@ -869,9 +875,63 @@ public class GenomeAnalysisEngine {
includeReadsWithDeletionAtLoci(),
argCollection.defaultBaseQualities,
removeProgramRecords,
keepReadsInLIBS);
keepReadsInLIBS,
sampleRenameMap);
}
/**
 * Loads a user-provided sample rename map file for use in on-the-fly sample renaming into an in-memory
 * HashMap. This file must consist of lines with exactly two whitespace-separated fields:
 *
 * absolute_path_to_bam_file new_sample_name
 *
 * The engine will verify that each bam file contains reads from only one sample when the on-the-fly sample
 * renaming feature is being used.
 *
 * @param sampleRenameMapFile sample rename map file from which to load data
 * @return a HashMap containing the contents of the map file, with the keys being SAMReaderIDs built from
 *         the bam file paths (with empty Tags) and the values being the new sample names.
 * @throws UserException.MalformedFile if a line does not have exactly 2 fields, if a bam file path is not
 *         absolute, or if the same bam file appears more than once
 * @throws UserException.CouldNotReadInputFile if the map file cannot be found
 */
protected Map<SAMReaderID, String> loadSampleRenameMap( final File sampleRenameMapFile ) {
    logger.info("Renaming samples from BAM files on-the-fly using mapping file " + sampleRenameMapFile.getAbsolutePath());

    // Presize the map assuming roughly 50 bytes per line. The division must happen on the long value
    // BEFORE narrowing to int: casting first can truncate a large length to a negative number, which
    // would make the HashMap constructor throw an IllegalArgumentException.
    final int estimatedEntries = (int) Math.min(Integer.MAX_VALUE, sampleRenameMapFile.length() / 50);
    final Map<SAMReaderID, String> sampleRenameMap = new HashMap<>(estimatedEntries);

    try {
        for ( final String line : new XReadLines(sampleRenameMapFile) ) {
            final String[] tokens = line.split("\\s+");

            if ( tokens.length != 2 ) {
                throw new UserException.MalformedFile(sampleRenameMapFile,
                                                      String.format("Encountered a line with %s fields instead of the required 2 fields. Line was: %s",
                                                                    tokens.length, line));
            }

            final File bamFile = new File(tokens[0]);
            final String newSampleName = tokens[1];

            if ( ! bamFile.isAbsolute() ) {
                throw new UserException.MalformedFile(sampleRenameMapFile, "Bam file path not absolute at line: " + line);
            }

            // Keys are reader IDs with empty tags, so lookups must use the same form of ID
            final SAMReaderID bamID = new SAMReaderID(bamFile, new Tags());

            if ( sampleRenameMap.containsKey(bamID) ) {
                throw new UserException.MalformedFile(sampleRenameMapFile,
                                                      String.format("Bam file %s appears more than once", bamFile.getAbsolutePath()));
            }

            sampleRenameMap.put(bamID, newSampleName);
        }
    }
    catch ( FileNotFoundException e ) {
        throw new UserException.CouldNotReadInputFile(sampleRenameMapFile, e);
    }

    return sampleRenameMap;
}
/**
* Opens a reference sequence file paired with an index. Only public for testing purposes
*

View File

@ -281,6 +281,15 @@ public class GATKArgumentCollection {
@Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Should we override the Walker's default and keep program records from the SAM header", required = false)
public boolean keepProgramRecords = false;
// Optional mapping file that drives on-the-fly sample renaming. Not required; stays null (the default)
// unless the user supplies --sample_rename_mapping_file. The expected line format (absolute BAM path,
// whitespace, new sample name) and the single-sample-per-BAM requirement are spelled out in the
// user-facing doc string below.
@Advanced
@Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file",
doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file. This option requires that " +
"each BAM file listed in the mapping file have only a single sample specified in its header (though there " +
"may be multiple read groups for that sample). Each line of the mapping file must contain the absolute path " +
"to a BAM file, followed by whitespace, followed by the new sample name for that BAM file.",
required = false)
public File sampleRenameMappingFile = null;
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
public ValidationExclusion.TYPE unsafe;

View File

@ -31,6 +31,7 @@ import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import net.sf.samtools.util.RuntimeIOException;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.ReadMetrics;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
@ -47,8 +48,10 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.File;
import java.io.FileNotFoundException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.*;
@ -131,6 +134,11 @@ public class SAMDataSource {
*/
private final Map<SAMReaderID,ReadGroupMapping> originalToMergedReadGroupMappings = new HashMap<SAMReaderID,ReadGroupMapping>();
/**
 * Mapping from bam file ID to new sample name. Used only when doing on-the-fly sample renaming.
 * Null when no sample renaming was requested. A bam file with no entry in the map keeps its
 * original sample name.
 */
private Map<SAMReaderID, String> sampleRenameMap = null;
/** our log, which we want to capture anything from this class */
private static Logger logger = Logger.getLogger(SAMDataSource.class);
@ -202,7 +210,8 @@ public class SAMDataSource {
includeReadsWithDeletionAtLoci,
(byte) -1,
false,
false);
false,
null);
}
/**
@ -219,6 +228,8 @@ public class SAMDataSource {
* bases will be seen in the pileups, and the deletions will be skipped silently.
* @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality.
* @param keepReadsInLIBS should we keep a unique list of reads in LIBS?
* @param sampleRenameMap Map of BAM file to new sample ID used during on-the-fly runtime sample renaming.
* Will be null if we're not doing sample renaming.
*/
public SAMDataSource(
Collection<SAMReaderID> samFiles,
@ -235,7 +246,9 @@ public class SAMDataSource {
boolean includeReadsWithDeletionAtLoci,
byte defaultBaseQualities,
boolean removeProgramRecords,
final boolean keepReadsInLIBS) {
final boolean keepReadsInLIBS,
final Map<SAMReaderID, String> sampleRenameMap) {
this.readMetrics = new ReadMetrics();
this.genomeLocParser = genomeLocParser;
@ -261,6 +274,8 @@ public class SAMDataSource {
ReadShard.setReadBufferSize(100000);
}
this.sampleRenameMap = sampleRenameMap;
resourcePool = new SAMResourcePool(Integer.MAX_VALUE);
SAMReaders readers = resourcePool.getAvailableReaders();
@ -825,8 +840,31 @@ public class SAMDataSource {
if ( totalNumberOfFiles > 0 ) logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime()));
Collection<SAMFileHeader> headers = new LinkedList<SAMFileHeader>();
for(SAMFileReader reader: readers.values())
headers.add(reader.getFileHeader());
// Examine the bam headers, perform any requested sample renaming on them, and add
// them to the list of headers to pass to the Picard SamFileHeaderMerger:
for ( final Map.Entry<SAMReaderID, SAMFileReader> readerEntry : readers.entrySet() ) {
final SAMReaderID readerID = readerEntry.getKey();
final SAMFileReader reader = readerEntry.getValue();
final SAMFileHeader header = reader.getFileHeader();
// The remappedSampleName will be null if either no on-the-fly sample renaming was requested,
// or the user's sample rename map file didn't contain an entry for this bam file:
final String remappedSampleName = sampleRenameMap != null ? sampleRenameMap.get(readerID) : null;
// If we've been asked to rename the sample for this bam file, do so now. We'll check to
// make sure this bam only contains reads from one sample before proceeding.
//
// IMPORTANT: relies on the fact that the Picard SamFileHeaderMerger makes a copy of
// the existing read group attributes (including sample name) when merging
// headers, regardless of whether there are read group collisions or not.
if ( remappedSampleName != null ) {
remapSampleName(readerID, header, remappedSampleName);
}
headers.add(header);
}
headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,headers,true);
// update all read groups to GATKSAMRecordReadGroups
@ -837,6 +875,43 @@ public class SAMDataSource {
headerMerger.getMergedHeader().setReadGroups(gatkReadGroups);
}
/**
 * Changes the sample name in the read groups for the provided bam file header to match the
 * remappedSampleName. Blows up with a UserException if any read group has a null sample
 * attribute, or if the header contains more than one sample name.
 *
 * All read groups are validated before any of them is modified, so the header is left
 * untouched when a UserException is thrown.
 *
 * @param readerID ID for the bam file from which the provided header came
 * @param header The bam file header. Will be modified by this call.
 * @param remappedSampleName New sample name to replace the existing sample attribute in the
 *                           read groups for the header.
 */
private void remapSampleName( final SAMReaderID readerID, final SAMFileHeader header, final String remappedSampleName ) {
    // Pass 1: verify that every read group has a sample, and that it is the same sample throughout
    String firstEncounteredSample = null;

    for ( final SAMReadGroupRecord readGroup : header.getReadGroups() ) {
        final String thisReadGroupSample = readGroup.getSample();

        if ( thisReadGroupSample == null ) {
            throw new UserException(String.format("On-the-fly sample renaming was requested for bam file %s, however this " +
                                                  "bam file contains a read group (id: %s) with a null sample attribute",
                                                  readerID.getSamFilePath(), readGroup.getId()));
        }
        else if ( firstEncounteredSample == null ) {
            firstEncounteredSample = thisReadGroupSample;
        }
        else if ( ! firstEncounteredSample.equals(thisReadGroupSample) ) {
            throw new UserException(String.format("On-the-fly sample renaming was requested for bam file %s, " +
                                                  "however this bam file contains reads from more than one sample " +
                                                  "(encountered samples %s and %s in the bam header). The GATK requires that " +
                                                  "all bams for which on-the-fly sample renaming is requested " +
                                                  "contain reads from only a single sample per bam.",
                                                  readerID.getSamFilePath(), firstEncounteredSample, thisReadGroupSample));
        }
    }

    // Pass 2: the header is known to be single-sample, so it is now safe to rename every read group
    for ( final SAMReadGroupRecord readGroup : header.getReadGroups() ) {
        readGroup.setSample(remappedSampleName);
    }
}
final private void printReaderPerformance(final int nExecutedTotal,
final int nExecutedInTick,
final int totalNumberOfFiles,

View File

@ -26,10 +26,13 @@
package org.broadinstitute.sting.gatk;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broad.tribble.readers.AsciiLineReader;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -45,13 +48,12 @@ import org.broadinstitute.variant.vcf.VCFCodec;
import org.broadinstitute.variant.vcf.VCFHeader;
import org.broadinstitute.variant.vcf.VCFHeaderLine;
import org.testng.Assert;
import org.testng.TestException;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileInputStream;
import java.io.PrintStream;
import java.util.Arrays;
import java.io.*;
import java.util.*;
/**
*
@ -278,6 +280,12 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
executeTest("testDefaultBaseQualitiesNoneProvided", testDefaultBaseQualities(null, ""));
}
// --------------------------------------------------------------------------------
//
// Test engine-level cigar consolidation
//
// --------------------------------------------------------------------------------
@Test
public void testGATKEngineConsolidatesCigars() {
final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" +
@ -297,4 +305,232 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
// Original cigar was 0M3M0M8M. Check that it's been consolidated after running through the GATK engine:
Assert.assertEquals(read.getCigarString(), "11M", "Cigar 0M3M0M8M not consolidated correctly by the engine");
}
// --------------------------------------------------------------------------------
//
// Test on-the-fly sample renaming
//
// --------------------------------------------------------------------------------
// On-the-fly sample renaming test case: one single-sample bam with multiple read groups.
// Runs PrintReads with a rename map and checks that every read group in the output bam header
// carries the new sample name.
@Test
public void testOnTheFlySampleRenamingWithSingleBamFile() throws IOException {
    final File sampleRenameMapFile = createTestSampleRenameMapFile(
            Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam myNewSampleName"));

    final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" +
                                                   " -R " + b37KGReference +
                                                   " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" +
                                                   " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() +
                                                   " -o %s",
                                                   1, Arrays.asList("")); // No MD5s; we only want to check the read groups

    final File outputBam = executeTest("testOnTheFlySampleRenamingWithSingleBamFile", spec).first.get(0);
    final SAMFileReader reader = new SAMFileReader(outputBam);

    // Close the reader even when an assertion fails
    try {
        int totalReadGroupsSeen = 0;
        for ( final SAMReadGroupRecord readGroup : reader.getFileHeader().getReadGroups() ) {
            Assert.assertEquals(readGroup.getSample(), "myNewSampleName", String.format("Sample for read group %s not renamed correctly", readGroup.getId()));
            totalReadGroupsSeen++;
        }

        // Guard against the loop above passing vacuously on an output header with no read groups
        Assert.assertTrue(totalReadGroupsSeen > 0, "No read groups encountered in output bam file");
    }
    finally {
        reader.close();
    }
}
// On-the-fly sample renaming test case: three single-sample bams with multiple read groups per bam.
// Builds the expected read group -> new sample mapping from the input bam headers, then checks the
// merged output header against it.
@Test
public void testOnTheFlySampleRenamingWithMultipleBamFiles() throws IOException {
    final File sampleRenameMapFile = createTestSampleRenameMapFile(
            Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878",
                          privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam newSampleFor12891",
                          privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam newSampleFor12892"));

    final Map<String, String> readGroupToNewSampleMap = new HashMap<>();
    for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) {
        final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID));
        final SAMFileReader inputBamReader = new SAMFileReader(inputBam);
        final String newSampleName = String.format("newSampleFor%s", inputBamID);

        // Close each input reader even if reading its header fails
        try {
            for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) {
                readGroupToNewSampleMap.put(readGroup.getId(), newSampleName);
            }
        }
        finally {
            inputBamReader.close();
        }
    }

    final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" +
                                                   " -R " + b37KGReference +
                                                   " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" +
                                                   " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam" +
                                                   " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam" +
                                                   " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() +
                                                   " -o %s",
                                                   1, Arrays.asList("")); // No MD5s; we only want to check the read groups

    final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFiles", spec).first.get(0);
    final SAMFileReader outputBamReader = new SAMFileReader(outputBam);

    // Close the output reader even when an assertion fails
    try {
        int totalReadGroupsSeen = 0;
        for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) {
            Assert.assertEquals(readGroup.getSample(), readGroupToNewSampleMap.get(readGroup.getId()),
                                String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId()));
            totalReadGroupsSeen++;
        }

        Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file");
    }
    finally {
        outputBamReader.close();
    }
}
// On-the-fly sample renaming test case: three single-sample bams with multiple read groups per bam,
// performing renaming in only SOME of the bams
@Test
public void testOnTheFlySampleRenamingWithMultipleBamFilesPartialRename() throws IOException {
    // Rename samples for NA12878 and NA12892, but not for NA12891
    final File sampleRenameMapFile = createTestSampleRenameMapFile(
            Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878",
                          privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam newSampleFor12892"));

    final Map<String, String> readGroupToNewSampleMap = new HashMap<>();
    for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) {
        final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID));
        final SAMFileReader inputBamReader = new SAMFileReader(inputBam);

        // Special-case NA12891, which we're not renaming:
        final String newSampleName = inputBamID.equals("12891") ? "NA12891" : String.format("newSampleFor%s", inputBamID);

        // Close each input reader even if reading its header fails
        try {
            for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) {
                readGroupToNewSampleMap.put(readGroup.getId(), newSampleName);
            }
        }
        finally {
            inputBamReader.close();
        }
    }

    final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" +
                                                   " -R " + b37KGReference +
                                                   " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" +
                                                   " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam" +
                                                   " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam" +
                                                   " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() +
                                                   " -o %s",
                                                   1, Arrays.asList("")); // No MD5s; we only want to check the read groups

    final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFilesPartialRename", spec).first.get(0);
    final SAMFileReader outputBamReader = new SAMFileReader(outputBam);

    // Close the output reader even when an assertion fails
    try {
        int totalReadGroupsSeen = 0;
        for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) {
            Assert.assertEquals(readGroup.getSample(), readGroupToNewSampleMap.get(readGroup.getId()),
                                String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId()));
            totalReadGroupsSeen++;
        }

        Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file");
    }
    finally {
        outputBamReader.close();
    }
}
// On-the-fly sample renaming test case: two single-sample bams with read group collisions.
// Read groups whose IDs appear in the NA12878 input header are expected to carry the NA12878
// replacement sample; every other read group in the output is expected to carry the other one.
@Test
public void testOnTheFlySampleRenamingWithReadGroupCollisions() throws IOException {
    final File sampleRenameMapFile = createTestSampleRenameMapFile(
            Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878",
                          privateTestDir + "CEUTrio.HiSeq.WGS.b37.READ_GROUP_COLLISIONS_WITH_NA12878.HEADERONLY.bam newSampleForNot12878"));

    final Set<String> na12878ReadGroups = new HashSet<>();
    final SAMFileReader inputBamReader = new SAMFileReader(new File(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam"));

    // Close the input reader even if reading its header fails
    try {
        for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) {
            na12878ReadGroups.add(readGroup.getId());
        }
    }
    finally {
        inputBamReader.close();
    }

    final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" +
                                                   " -R " + b37KGReference +
                                                   " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" +
                                                   " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.READ_GROUP_COLLISIONS_WITH_NA12878.HEADERONLY.bam" +
                                                   " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() +
                                                   " -o %s",
                                                   1, Arrays.asList("")); // No MD5s; we only want to check the read groups

    final File outputBam = executeTest("testOnTheFlySampleRenamingWithReadGroupCollisions", spec).first.get(0);
    final SAMFileReader outputBamReader = new SAMFileReader(outputBam);

    // Close the output reader even when an assertion fails
    try {
        int totalReadGroupsSeen = 0;
        for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) {
            final String expectedSampleName = na12878ReadGroups.contains(readGroup.getId())
                                              ? "newSampleFor12878"
                                              : "newSampleForNot12878";

            Assert.assertEquals(readGroup.getSample(), expectedSampleName,
                                String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId()));
            totalReadGroupsSeen++;
        }

        // The test expects twice as many read groups in the output as in the NA12878 input header
        Assert.assertEquals(totalReadGroupsSeen, na12878ReadGroups.size() * 2, "Wrong number of read groups encountered in output bam file");
    }
    finally {
        outputBamReader.close();
    }
}
// On-the-fly sample renaming test case: requesting a rename for a multi-sample bam.
// The spec is built to expect a UserException rather than any output.
@Test
public void testOnTheFlySampleRenamingWithMultiSampleBam() throws IOException {
    final String multiSampleBam = privateTestDir + "CEUTrio.HiSeq.WGS.b37.MERGED.HEADERONLY.bam";
    final File sampleRenameMapFile = createTestSampleRenameMapFile(Arrays.asList(multiSampleBam + " myNewSampleName"));

    final String commandLine = " -T PrintReads"
                               + " -R " + b37KGReference
                               + " -I " + multiSampleBam
                               + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath()
                               + " -o %s";

    // Third argument is the expected exception type: the run must fail with a UserException
    final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 1, UserException.class);

    executeTest("testOnTheFlySampleRenamingWithMultiSampleBam", spec);
}
// On-the-fly sample renaming test case: the renamed sample must be visible on the individual reads
// handed to a walker, which OnTheFlySampleRenamingVerifyingTestWalker checks read by read.
@Test
public void testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInReads() throws IOException {
    final String inputBam = privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
    final File sampleRenameMapFile = createTestSampleRenameMapFile(Arrays.asList(inputBam + " myNewSampleName"));

    final String commandLine = " -T OnTheFlySampleRenamingVerifyingTestWalker"
                               + " -R " + b37KGReference
                               + " -I " + inputBam
                               + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath()
                               + " --newSampleName myNewSampleName"
                               + " -L 20:10000000-10001000";

    final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 1, Arrays.asList(""));

    // Passing means the custom walker never threw while traversing the reads
    executeTest("testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInReads", spec);
}
/**
 * Writes the given lines, one per line, to a fresh temp file for use as a sample rename map.
 *
 * @param contents lines to write to the map file, in order
 * @return the temp file containing the lines
 * @throws IOException if the temp file cannot be opened for writing or a write error occurs
 */
private File createTestSampleRenameMapFile( final List<String> contents ) throws IOException {
    final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp");

    // try-with-resources closes the writer on every exit path
    try ( final PrintWriter writer = new PrintWriter(mapFile) ) {
        for ( final String line : contents ) {
            writer.println(line);
        }

        // PrintWriter never throws on write failure; checkError() flushes and reports it, so a
        // truncated map file fails here rather than surfacing later as a confusing test failure
        if ( writer.checkError() ) {
            throw new IOException("Failed to write sample rename map file " + mapFile.getAbsolutePath());
        }
    }

    return mapFile;
}
/**
 * Test-only read walker that fails with an IllegalStateException as soon as it sees a read whose
 * read group sample differs from the expected --newSampleName. Maps each accepted read to 1 and
 * reduces by summing.
 */
public static class OnTheFlySampleRenamingVerifyingTestWalker extends ReadWalker<Integer, Integer> {
    @Argument(fullName = "newSampleName", shortName = "newSampleName", doc = "", required = true)
    String newSampleName = null;

    public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
        final String observedSampleName = read.getReadGroup().getSample();

        if ( newSampleName.equals(observedSampleName) ) {
            return 1;
        }

        throw new IllegalStateException(String.format("Encountered read with the wrong sample name. Expected %s found %s",
                                                      newSampleName, observedSampleName));
    }

    public Integer reduceInit() {
        return 0;
    }

    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }
}
}

View File

@ -42,10 +42,9 @@ import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
/**
* Tests selected functionality in the GenomeAnalysisEngine class
@ -104,6 +103,64 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest {
testEngine.validateSuppliedIntervals();
}
// Loads a well-formed three-line map file and checks that each bam path maps to its new sample name
@Test
public void testLoadWellFormedSampleRenameMapFile() throws IOException {
    final File mapFile = createTestSampleRenameMapFile(Arrays.asList("/foo/bar/first.bam newSample1",
                                                                     "/foo/bar/second.bam newSample2",
                                                                     "/foo/bar2/third.bam newSample3"));
    final Map<SAMReaderID, String> renameMap = new GenomeAnalysisEngine().loadSampleRenameMap(mapFile);

    Assert.assertEquals(renameMap.size(), 3, "Sample rename map was wrong size after loading from file");

    // Each row is { bam path, expected new sample name }
    final String[][] expectedEntries = {
            { "/foo/bar/first.bam", "newSample1" },
            { "/foo/bar/second.bam", "newSample2" },
            { "/foo/bar2/third.bam", "newSample3" }
    };

    for ( final String[] expectedEntry : expectedEntries ) {
        final String bamPath = expectedEntry[0];
        final String expectedSample = expectedEntry[1];

        // Look up with an ID built from the path and empty tags
        final String loadedSample = renameMap.get(new SAMReaderID(bamPath, new Tags()));

        Assert.assertNotNull(loadedSample, String.format("Entry for %s not found in sample rename map", bamPath));
        Assert.assertEquals(loadedSample, expectedSample, "Wrong value in sample rename map for " + bamPath);
    }
}
// Supplies { test name, map file } pairs, each of which is expected to make loadSampleRenameMap
// throw a UserException
@DataProvider(name = "MalformedSampleRenameMapFileDataProvider")
public Object[][] generateMalformedSampleRenameMapFiles() throws IOException {
    return new Object[][] {
            // Map file that does not exist
            { "testLoadSampleRenameMapFileNonExistentFile",
              new File("/foo/bar/nonexistent") },

            // Line with only one field
            { "testLoadSampleRenameMapFileMalformedLine1",
              createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam")) },

            // Line with three fields
            { "testLoadSampleRenameMapFileMalformedLine2",
              createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam newSample extraField")) },

            // Bam path that is not absolute
            { "testLoadSampleRenameMapFileNonAbsoluteBamPath",
              createTestSampleRenameMapFile(Arrays.asList("relative/path/to/foo.bam newSample")) },

            // Same bam path listed twice
            { "testLoadSampleRenameMapFileDuplicateBamPath",
              createTestSampleRenameMapFile(Arrays.asList("/path/to/dupe.bam newSample1",
                                                          "/path/to/dupe.bam newSample2")) }
    };
}
// Every map file from the data provider must make loadSampleRenameMap throw a UserException
@Test(dataProvider = "MalformedSampleRenameMapFileDataProvider", expectedExceptions = UserException.class)
public void testLoadMalformedSampleRenameMapFile( final String testName, final File mapFile ) {
    logger.info("Executing test " + testName);

    final GenomeAnalysisEngine engine = new GenomeAnalysisEngine();

    // The return value is deliberately ignored: the call is expected to throw before returning
    engine.loadSampleRenameMap(mapFile);
}
/**
 * Writes the given lines, one per line, to a fresh temp file for use as a sample rename map.
 *
 * @param contents lines to write to the map file, in order
 * @return the temp file containing the lines
 * @throws IOException if the temp file cannot be opened for writing or a write error occurs
 */
private File createTestSampleRenameMapFile( final List<String> contents ) throws IOException {
    final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp");

    // try-with-resources closes the writer on every exit path
    try ( final PrintWriter writer = new PrintWriter(mapFile) ) {
        for ( final String line : contents ) {
            writer.println(line);
        }

        // PrintWriter never throws on write failure; checkError() flushes and reports it, so a
        // truncated map file fails here rather than surfacing later as a confusing test failure
        if ( writer.checkError() ) {
            throw new IOException("Failed to write sample rename map file " + mapFile.getAbsolutePath());
        }
    }

    return mapFile;
}
///////////////////////////////////////////////////
// Test the ReadTransformer ordering enforcement //

View File

@ -158,7 +158,7 @@ public class ReadMetricsUnitTest extends BaseTest {
new ValidationExclusion(),
new ArrayList<ReadFilter>(),
new ArrayList<ReadTransformer>(),
false, (byte)30, false, true);
false, (byte)30, false, true, null);
engine.setReadsDataSource(dataSource);
@ -193,7 +193,7 @@ public class ReadMetricsUnitTest extends BaseTest {
new ValidationExclusion(),
new ArrayList<ReadFilter>(),
new ArrayList<ReadTransformer>(),
false, (byte)30, false, true);
false, (byte)30, false, true, null);
engine.setReadsDataSource(dataSource);
final Set<String> samples = SampleUtils.getSAMFileSamples(dataSource.getHeader());
@ -234,7 +234,7 @@ public class ReadMetricsUnitTest extends BaseTest {
new ValidationExclusion(),
new ArrayList<ReadFilter>(),
new ArrayList<ReadTransformer>(),
false, (byte)30, false, true);
false, (byte)30, false, true, null);
engine.setReadsDataSource(dataSource);
final Set<String> samples = SampleUtils.getSAMFileSamples(dataSource.getHeader());
@ -281,7 +281,7 @@ public class ReadMetricsUnitTest extends BaseTest {
new ValidationExclusion(),
filters,
new ArrayList<ReadTransformer>(),
false, (byte)30, false, true);
false, (byte)30, false, true, null);
engine.setReadsDataSource(dataSource);

View File

@ -183,7 +183,8 @@ public class SAMDataSourceUnitTest extends BaseTest {
false,
(byte) -1,
removeProgramRecords,
false);
false,
null);
List<SAMProgramRecord> dontRemoveProgramRecords = data.getHeader().getProgramRecords();
assertEquals(dontRemoveProgramRecords, defaultProgramRecords, "testRemoveProgramRecords: default program records differ from removeProgramRecords = false");
@ -203,7 +204,8 @@ public class SAMDataSourceUnitTest extends BaseTest {
false,
(byte) -1,
removeProgramRecords,
false);
false,
null);
List<SAMProgramRecord> doRemoveProgramRecords = data.getHeader().getProgramRecords();
assertTrue(doRemoveProgramRecords.isEmpty(), "testRemoveProgramRecords: program records not cleared when removeProgramRecords = true");

View File

@ -481,7 +481,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
new ValidationExclusion(),
new ArrayList<ReadFilter>(),
new ArrayList<ReadTransformer>(),
false, (byte)30, false, true);
false, (byte)30, false, true, null);
engine.setReadsDataSource(dataSource);
final Set<String> samples = SampleUtils.getSAMFileSamples(dataSource.getHeader());