From 8290d3c8acd73150d2a496264eae7417e448db29 Mon Sep 17 00:00:00 2001 From: Michael McCowan Date: Fri, 18 Apr 2014 13:55:41 -0400 Subject: [PATCH] Allow for non-tab whitespace in sample names when performing on-the-fly sample-renaming. --- .../sting/gatk/GenomeAnalysisEngine.java | 17 +++++++++++--- .../arguments/GATKArgumentCollection.java | 7 +++--- .../gatk/GenomeAnalysisEngineUnitTest.java | 22 ++++++++++++------- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 97495a3f9..9eee9b7e4 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -64,6 +64,7 @@ import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; import org.broadinstitute.sting.utils.recalibration.BQSRArgumentSet; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; +import org.broadinstitute.variant.vcf.VCFConstants; import java.io.File; import java.io.FileNotFoundException; @@ -892,7 +893,8 @@ public class GenomeAnalysisEngine { /** * Loads a user-provided sample rename map file for use in on-the-fly sample renaming into an in-memory - * HashMap. This file must consist of lines with two whitespace-separated fields: + * HashMap. This file must consist of lines with two whitespace-separated fields, the second of which + * may contain whitespace: * * absolute_path_to_file new_sample_name * @@ -910,7 +912,7 @@ public class GenomeAnalysisEngine { try { for ( final String line : new XReadLines(sampleRenameMapFile) ) { - final String[] tokens = line.split("\\s+"); + final String[] tokens = line.split("\\s+", 2); if ( tokens.length != 2 ) { throw new UserException.MalformedFile(sampleRenameMapFile, @@ -919,7 +921,16 @@ public class GenomeAnalysisEngine { } final File inputFile = new File(tokens[0]); - final String newSampleName = tokens[1]; + final String newSampleName = tokens[1].trim(); + + if (newSampleName.contains(VCFConstants.FIELD_SEPARATOR)) { + throw new UserException.MalformedFile(sampleRenameMapFile, String.format( + "Encountered illegal sample name; sample names may not include the VCF field delimiter (%s). Sample name: %s; line: %s", + VCFConstants.FIELD_SEPARATOR, + newSampleName, + line + )); + } if ( ! inputFile.isAbsolute() ) { throw new UserException.MalformedFile(sampleRenameMapFile, "Input file path not absolute at line: " + line); diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index f333dcffa..8e936d853 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -363,9 +363,10 @@ public class GATKArgumentCollection { /** * On-the-fly sample renaming works only with single-sample BAM and VCF files. Each line of the mapping file must * contain the absolute path to a BAM or VCF file, followed by whitespace, followed by the new sample name for that - * BAM or VCF file. The engine will verify at runtime that each BAM/VCF targeted for sample renaming has only - * a single sample specified in its header (though, in the case of BAM files, there may be multiple read groups for - * that sample). + * BAM or VCF file. The sample name may contain non-tab whitespace, but leading or trailing whitespace will be + * ignored. The engine will verify at runtime that each BAM/VCF targeted for sample renaming has only a single + * sample specified in its header (though, in the case of BAM files, there may be multiple read groups for that + * sample). */ @Advanced @Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file", doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file", required = false) diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java index 21a18f804..995a7ccb4 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.commandline.ArgumentException; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; @@ -126,13 +125,21 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { public void testLoadWellFormedSampleRenameMapFile() throws IOException { final File mapFile = createTestSampleRenameMapFile(Arrays.asList("/foo/bar/first.bam newSample1", "/foo/bar/second.bam newSample2", - "/foo/bar2/third.bam newSample3")); + "/foo/bar2/third.bam newSample3", + "/foo/bar2/fourth.bam new sample 4", + "/foo/bar2/fifth.bam new sample 5 ")); final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); final Map renameMap = engine.loadSampleRenameMap(mapFile); - Assert.assertEquals(renameMap.size(), 3, "Sample rename map was wrong size after loading from file"); + Assert.assertEquals(renameMap.size(), 5, "Sample rename map was wrong size after loading from file"); - final Iterator expectedResultsIterator = Arrays.asList("/foo/bar/first.bam", "newSample1", "/foo/bar/second.bam", "newSample2", "/foo/bar2/third.bam", "newSample3").iterator(); + final Iterator expectedResultsIterator = Arrays.asList( + "/foo/bar/first.bam", "newSample1", + "/foo/bar/second.bam", "newSample2", + "/foo/bar2/third.bam", "newSample3", + "/foo/bar2/fourth.bam", "new sample 4", + "/foo/bar2/fifth.bam", "new sample 5" + ).iterator(); while ( expectedResultsIterator.hasNext() ) { final String expectedKey = expectedResultsIterator.next(); final String expectedValue = expectedResultsIterator.next(); @@ -148,16 +155,15 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { tests.add(new Object[]{"testLoadSampleRenameMapFileNonExistentFile", new File("/foo/bar/nonexistent")}); - tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine1", + tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine", createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam"))}); - tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine2", - createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam newSample extraField"))}); tests.add(new Object[]{"testLoadSampleRenameMapFileNonAbsoluteBamPath", createTestSampleRenameMapFile(Arrays.asList("relative/path/to/foo.bam newSample"))}); tests.add(new Object[]{"testLoadSampleRenameMapFileDuplicateBamPath", createTestSampleRenameMapFile(Arrays.asList("/path/to/dupe.bam newSample1", "/path/to/dupe.bam newSample2"))}); - + tests.add(new Object[]{"testLoadSampleRenameMapFileTabInSampleName", + createTestSampleRenameMapFile(Arrays.asList("/path/to/stuff.bam some wonky\tsample "))}); return tests.toArray(new Object[][]{}); }