From f421062b5594fd47c39a4a274ae65784c494fded Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 19 Mar 2012 16:04:45 -0400 Subject: [PATCH] Updated read group covariate to use sample.lane instead of the id Added Unit test. --- .../gatk/walkers/bqsr/ReadGroupCovariate.java | 50 +++++++++++--- .../utils/sam/GATKSAMReadGroupRecord.java | 10 +++ .../sting/utils/sam/ReadUtils.java | 7 ++ .../bqsr/ContextCovariateUnitTest.java | 9 +-- .../walkers/bqsr/CycleCovariateUnitTest.java | 11 +-- .../bqsr/ReadGroupCovariateUnitTest.java | 67 +++++++++++++++++++ 6 files changed, 129 insertions(+), 25 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java index ad4f94f33..eb20f7779 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java @@ -1,11 +1,13 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.BitSetUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Arrays; import java.util.BitSet; import java.util.HashMap; +import java.util.regex.Pattern; /* * Copyright (c) 2009 The Broad Institute @@ -45,6 +47,10 @@ public class ReadGroupCovariate implements RequiredCovariate { private final HashMap readGroupLookupTable = new HashMap(); private final HashMap readGroupReverseLookupTable = new HashMap(); private short nextId = 0; + + private static final String LANE_TAG = "LN"; + private static final String SAMPLE_TAG = "SM"; + // Initialize any member variables using the command-line arguments passed to the walkers @Override @@ -54,14 +60,13 @@ public class ReadGroupCovariate implements RequiredCovariate { @Override public CovariateValues getValues(final GATKSAMRecord read) { final int l = read.getReadLength(); - final String readGroupId = read.getReadGroup().getReadGroupId(); - BitSet rg = bitSetForReadGroup(readGroupId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset + final String readGroupId = readGroupValueFromRG(read.getReadGroup()); + BitSet rg = bitSetForReadGroup(readGroupId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset BitSet[] readGroups = new BitSet[l]; Arrays.fill(readGroups, rg); return new CovariateValues(readGroups, readGroups, readGroups); } - // Used to get the covariate's value from input csv file during on-the-fly recalibration @Override public final Object getValue(final String str) { return str; @@ -77,15 +82,15 @@ public class ReadGroupCovariate implements RequiredCovariate { return bitSetForReadGroup((String) key); } - public final String decodeReadGroup(final short id) { - return readGroupReverseLookupTable.get(id); - } - @Override public int numberOfBits() { return BitSetUtils.numberOfBitsToRepresent(Short.MAX_VALUE); } - + + private String decodeReadGroup(final short id) { + return readGroupReverseLookupTable.get(id); + } + private BitSet bitSetForReadGroup(String readGroupId) { short shortId; if (readGroupLookupTable.containsKey(readGroupId)) @@ -98,6 +103,35 @@ public class ReadGroupCovariate implements RequiredCovariate { } return BitSetUtils.bitSetFrom(shortId); } + + /** + * Gather the sample and lane information from the read group record and return sample.lane + * + * If the bam file is missing the lane information, it tries to use the id regex standardized + * by the Broad Institute to extract the lane information + * + * If it fails to find either of the two pieces of information, will return the read group id instead. + * + * @param rg the read group record + * @return sample.lane or id if information is missing. + */ + private String readGroupValueFromRG(GATKSAMReadGroupRecord rg) { + String lane = rg.getLane(); // take the sample's lane from the read group lane tag + String sample = rg.getSample(); // take the sample's name from the read group sample tag + String value = rg.getId(); // initialize the return value with the read group ID in case we can't find the sample or the lane + + if (lane == null) { // if this bam doesn't have the lane annotation in the read group try to take it from the read group id + String [] splitID = rg.getId().split(Pattern.quote(".")); + if (splitID.length > 1) // if the id doesn't follow the BROAD defined regex (PU.LANE), fall back to the read group id + lane = splitID[splitID.length - 1]; // take the lane from the readgroup id + } + + if (sample != null && lane != null) + value = sample + "." + lane; // the read group covariate is sample.lane (where the inforamtion is available) + + return value; + } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java index ff7d12f09..df1ff2a0e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java @@ -13,6 +13,8 @@ import org.broadinstitute.sting.utils.NGSPlatform; */ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord { + public static String LANE_TAG = "LN"; + // the SAMReadGroupRecord data we're caching private String mSample = null; private String mPlatform = null; @@ -79,4 +81,12 @@ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord { return mNGSPlatform; } + + public String getLane() { + return this.getAttribute(LANE_TAG); + } + + public void setLane(String lane) { + this.setAttribute(LANE_TAG, lane); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 9d731e489..cbb4120dd 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -698,6 +698,13 @@ public class ReadUtils { return bases; } + public static GATKSAMRecord createRandomRead(int length) { + byte[] quals = ReadUtils.createRandomReadQuals(length); + byte[] bbases = ReadUtils.createRandomReadBases(length, true); + return ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M"); + } + + public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) { String[] sequenceRecordNames = new String[sequenceDictionary.size()]; int sequenceRecordIndex = 0; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java index 30a9bad3e..2b4cb2ae3 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java @@ -1,9 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; @@ -11,7 +9,6 @@ import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.util.BitSet; -import java.util.Random; /** * @author Mauricio Carneiro @@ -20,22 +17,18 @@ import java.util.Random; public class ContextCovariateUnitTest { ContextCovariate covariate; RecalibrationArgumentCollection RAC; - Random random; @BeforeClass public void init() { RAC = new RecalibrationArgumentCollection(); covariate = new ContextCovariate(); - random = GenomeAnalysisEngine.getRandomGenerator(); covariate.initialize(RAC); } @Test(enabled = true) public void testSimpleContexts() { - byte[] quals = ReadUtils.createRandomReadQuals(10000); - byte[] bbases = ReadUtils.createRandomReadBases(10000, true); - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M"); + GATKSAMRecord read = ReadUtils.createRandomRead(1000); GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); CovariateValues values = covariate.getValues(read); verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases())); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java index 49315672c..d80cddd3e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java @@ -1,7 +1,5 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -10,7 +8,6 @@ import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.util.BitSet; -import java.util.Random; /** * @author Mauricio Carneiro @@ -19,22 +16,18 @@ import java.util.Random; public class CycleCovariateUnitTest { CycleCovariate covariate; RecalibrationArgumentCollection RAC; - Random random; @BeforeClass public void init() { RAC = new RecalibrationArgumentCollection(); covariate = new CycleCovariate(); - random = GenomeAnalysisEngine.getRandomGenerator(); covariate.initialize(RAC); } @Test(enabled = true) public void testSimpleCycles() { - short readLength = 10; - byte[] quals = ReadUtils.createRandomReadQuals(readLength); - byte[] bbases = ReadUtils.createRandomReadBases(readLength, true); - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M"); + short readLength = 10; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); read.getReadGroup().setPlatform("illumina"); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java new file mode 100644 index 000000000..6276022d1 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java @@ -0,0 +1,67 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.BitSet; + +/** + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class ReadGroupCovariateUnitTest { + ReadGroupCovariate covariate; + RecalibrationArgumentCollection RAC; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new ReadGroupCovariate(); + covariate.initialize(RAC); + } + + @Test(enabled = true) + public void testSingleRecord() { + final String expected = "SAMPLE.1"; + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID"); + rg.setSample("SAMPLE"); + rg.setLane("1"); + runTest(rg, expected); + } + + @Test(enabled = true) + public void testMissingLane() { + final String expected = "SAMPLE.7"; + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.7"); + rg.setSample("SAMPLE"); + runTest(rg, expected); + } + + @Test(enabled = true) + public void testMissingSample() { + final String expected = "MY.ID"; + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID"); + rg.setLane("1"); + runTest(rg, expected); + } + + private void runTest(GATKSAMReadGroupRecord rg, String expected) { + GATKSAMRecord read = ReadUtils.createRandomRead(10); + read.setReadGroup(rg); + CovariateValues values = covariate.getValues(read); + verifyCovariateArray(values.getMismatches(), expected); + + } + + private void verifyCovariateArray(BitSet[] values, String expected) { + for (BitSet value : values) { + String actual = covariate.keyFromBitSet(value); + Assert.assertEquals(actual, expected); + } + } + +}