Merge pull request #611 from broadinstitute/mm_otf_sample_rename_support_whitespacing_sample_names

Allow for whitespace in sample names when performing on-the-fly sample-renaming.
This commit is contained in:
droazen 2014-04-22 13:01:15 -04:00
commit 58c8b2dd84
3 changed files with 32 additions and 14 deletions

View File

@ -64,6 +64,7 @@ import org.broadinstitute.sting.utils.progressmeter.ProgressMeter;
import org.broadinstitute.sting.utils.recalibration.BQSRArgumentSet;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
import org.broadinstitute.variant.vcf.VCFConstants;
import java.io.File;
import java.io.FileNotFoundException;
@ -892,7 +893,8 @@ public class GenomeAnalysisEngine {
/**
* Loads a user-provided sample rename map file for use in on-the-fly sample renaming into an in-memory
* HashMap. This file must consist of lines with two whitespace-separated fields:
* HashMap. This file must consist of lines with two whitespace-separated fields, the second of which
* may contain whitespace:
*
* absolute_path_to_file new_sample_name
*
@ -910,7 +912,7 @@ public class GenomeAnalysisEngine {
try {
for ( final String line : new XReadLines(sampleRenameMapFile) ) {
final String[] tokens = line.split("\\s+");
final String[] tokens = line.split("\\s+", 2);
if ( tokens.length != 2 ) {
throw new UserException.MalformedFile(sampleRenameMapFile,
@ -919,7 +921,16 @@ public class GenomeAnalysisEngine {
}
final File inputFile = new File(tokens[0]);
final String newSampleName = tokens[1];
final String newSampleName = tokens[1].trim();
if (newSampleName.contains(VCFConstants.FIELD_SEPARATOR)) {
throw new UserException.MalformedFile(sampleRenameMapFile, String.format(
"Encountered illegal sample name; sample names may not include the VCF field delimiter (%s). Sample name: %s; line: %s",
VCFConstants.FIELD_SEPARATOR,
newSampleName,
line
));
}
if ( ! inputFile.isAbsolute() ) {
throw new UserException.MalformedFile(sampleRenameMapFile, "Input file path not absolute at line: " + line);

View File

@ -363,9 +363,10 @@ public class GATKArgumentCollection {
/**
* On-the-fly sample renaming works only with single-sample BAM and VCF files. Each line of the mapping file must
* contain the absolute path to a BAM or VCF file, followed by whitespace, followed by the new sample name for that
* BAM or VCF file. The engine will verify at runtime that each BAM/VCF targeted for sample renaming has only
* a single sample specified in its header (though, in the case of BAM files, there may be multiple read groups for
* that sample).
* BAM or VCF file. The sample name may contain non-tab whitespace, but leading or trailing whitespace will be
* ignored. The engine will verify at runtime that each BAM/VCF targeted for sample renaming has only a single
* sample specified in its header (though, in the case of BAM files, there may be multiple read groups for that
* sample).
*/
@Advanced
@Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file", doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file", required = false)

View File

@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.ArgumentException;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
@ -126,13 +125,21 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest {
public void testLoadWellFormedSampleRenameMapFile() throws IOException {
final File mapFile = createTestSampleRenameMapFile(Arrays.asList("/foo/bar/first.bam newSample1",
"/foo/bar/second.bam newSample2",
"/foo/bar2/third.bam newSample3"));
"/foo/bar2/third.bam newSample3",
"/foo/bar2/fourth.bam new sample 4",
"/foo/bar2/fifth.bam new sample 5 "));
final GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
final Map<String, String> renameMap = engine.loadSampleRenameMap(mapFile);
Assert.assertEquals(renameMap.size(), 3, "Sample rename map was wrong size after loading from file");
Assert.assertEquals(renameMap.size(), 5, "Sample rename map was wrong size after loading from file");
final Iterator<String> expectedResultsIterator = Arrays.asList("/foo/bar/first.bam", "newSample1", "/foo/bar/second.bam", "newSample2", "/foo/bar2/third.bam", "newSample3").iterator();
final Iterator<String> expectedResultsIterator = Arrays.asList(
"/foo/bar/first.bam", "newSample1",
"/foo/bar/second.bam", "newSample2",
"/foo/bar2/third.bam", "newSample3",
"/foo/bar2/fourth.bam", "new sample 4",
"/foo/bar2/fifth.bam", "new sample 5"
).iterator();
while ( expectedResultsIterator.hasNext() ) {
final String expectedKey = expectedResultsIterator.next();
final String expectedValue = expectedResultsIterator.next();
@ -148,16 +155,15 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest {
tests.add(new Object[]{"testLoadSampleRenameMapFileNonExistentFile",
new File("/foo/bar/nonexistent")});
tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine1",
tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine",
createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam"))});
tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine2",
createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam newSample extraField"))});
tests.add(new Object[]{"testLoadSampleRenameMapFileNonAbsoluteBamPath",
createTestSampleRenameMapFile(Arrays.asList("relative/path/to/foo.bam newSample"))});
tests.add(new Object[]{"testLoadSampleRenameMapFileDuplicateBamPath",
createTestSampleRenameMapFile(Arrays.asList("/path/to/dupe.bam newSample1",
"/path/to/dupe.bam newSample2"))});
tests.add(new Object[]{"testLoadSampleRenameMapFileTabInSampleName",
createTestSampleRenameMapFile(Arrays.asList("/path/to/stuff.bam some wonky\tsample "))});
return tests.toArray(new Object[][]{});
}