Updated read group covariate to use sample.lane instead of the id

Added Unit test.
This commit is contained in:
Mauricio Carneiro 2012-03-19 16:04:45 -04:00
parent 539da9e3e1
commit f421062b55
6 changed files with 129 additions and 25 deletions

View File

@ -1,11 +1,13 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.regex.Pattern;
/*
* Copyright (c) 2009 The Broad Institute
@ -45,6 +47,10 @@ public class ReadGroupCovariate implements RequiredCovariate {
private final HashMap<String, Short> readGroupLookupTable = new HashMap<String, Short>();
private final HashMap<Short, String> readGroupReverseLookupTable = new HashMap<Short, String>();
private short nextId = 0;
private static final String LANE_TAG = "LN";
private static final String SAMPLE_TAG = "SM";
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
@ -54,14 +60,13 @@ public class ReadGroupCovariate implements RequiredCovariate {
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
final int l = read.getReadLength();
final String readGroupId = read.getReadGroup().getReadGroupId();
BitSet rg = bitSetForReadGroup(readGroupId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset
final String readGroupId = readGroupValueFromRG(read.getReadGroup());
BitSet rg = bitSetForReadGroup(readGroupId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset
BitSet[] readGroups = new BitSet[l];
Arrays.fill(readGroups, rg);
return new CovariateValues(readGroups, readGroups, readGroups);
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Object getValue(final String str) {
return str;
@ -77,15 +82,15 @@ public class ReadGroupCovariate implements RequiredCovariate {
return bitSetForReadGroup((String) key);
}
public final String decodeReadGroup(final short id) {
return readGroupReverseLookupTable.get(id);
}
@Override
public int numberOfBits() {
return BitSetUtils.numberOfBitsToRepresent(Short.MAX_VALUE);
}
private String decodeReadGroup(final short id) {
return readGroupReverseLookupTable.get(id);
}
private BitSet bitSetForReadGroup(String readGroupId) {
short shortId;
if (readGroupLookupTable.containsKey(readGroupId))
@ -98,6 +103,35 @@ public class ReadGroupCovariate implements RequiredCovariate {
}
return BitSetUtils.bitSetFrom(shortId);
}
/**
* Gather the sample and lane information from the read group record and return sample.lane
*
* If the bam file is missing the lane information, it tries to use the id regex standardized
* by the Broad Institute to extract the lane information
*
* If it fails to find either of the two pieces of information, will return the read group id instead.
*
* @param rg the read group record
* @return sample.lane or id if information is missing.
*/
private String readGroupValueFromRG(GATKSAMReadGroupRecord rg) {
String lane = rg.getLane(); // take the sample's lane from the read group lane tag
String sample = rg.getSample(); // take the sample's name from the read group sample tag
String value = rg.getId(); // initialize the return value with the read group ID in case we can't find the sample or the lane
if (lane == null) { // if this bam doesn't have the lane annotation in the read group try to take it from the read group id
String [] splitID = rg.getId().split(Pattern.quote("."));
if (splitID.length > 1) // if the id doesn't follow the BROAD defined regex (PU.LANE), fall back to the read group id
lane = splitID[splitID.length - 1]; // take the lane from the readgroup id
}
if (sample != null && lane != null)
value = sample + "." + lane; // the read group covariate is sample.lane (where the inforamtion is available)
return value;
}
}

View File

@ -13,6 +13,8 @@ import org.broadinstitute.sting.utils.NGSPlatform;
*/
public class GATKSAMReadGroupRecord extends SAMReadGroupRecord {
public static String LANE_TAG = "LN";
// the SAMReadGroupRecord data we're caching
private String mSample = null;
private String mPlatform = null;
@ -79,4 +81,12 @@ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord {
return mNGSPlatform;
}
public String getLane() {
return this.getAttribute(LANE_TAG);
}
public void setLane(String lane) {
this.setAttribute(LANE_TAG, lane);
}
}

View File

@ -698,6 +698,13 @@ public class ReadUtils {
return bases;
}
public static GATKSAMRecord createRandomRead(int length) {
byte[] quals = ReadUtils.createRandomReadQuals(length);
byte[] bbases = ReadUtils.createRandomReadBases(length, true);
return ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
}
public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) {
String[] sequenceRecordNames = new String[sequenceDictionary.size()];
int sequenceRecordIndex = 0;

View File

@ -1,9 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.clipping.ClippingRepresentation;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.testng.Assert;
@ -11,7 +9,6 @@ import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.util.BitSet;
import java.util.Random;
/**
* @author Mauricio Carneiro
@ -20,22 +17,18 @@ import java.util.Random;
public class ContextCovariateUnitTest {
ContextCovariate covariate;
RecalibrationArgumentCollection RAC;
Random random;
@BeforeClass
public void init() {
RAC = new RecalibrationArgumentCollection();
covariate = new ContextCovariate();
random = GenomeAnalysisEngine.getRandomGenerator();
covariate.initialize(RAC);
}
@Test(enabled = true)
public void testSimpleContexts() {
byte[] quals = ReadUtils.createRandomReadQuals(10000);
byte[] bbases = ReadUtils.createRandomReadBases(10000, true);
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
GATKSAMRecord read = ReadUtils.createRandomRead(1000);
GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS);
CovariateValues values = covariate.getValues(read);
verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));

View File

@ -1,7 +1,5 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -10,7 +8,6 @@ import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.util.BitSet;
import java.util.Random;
/**
* @author Mauricio Carneiro
@ -19,22 +16,18 @@ import java.util.Random;
public class CycleCovariateUnitTest {
CycleCovariate covariate;
RecalibrationArgumentCollection RAC;
Random random;
@BeforeClass
public void init() {
RAC = new RecalibrationArgumentCollection();
covariate = new CycleCovariate();
random = GenomeAnalysisEngine.getRandomGenerator();
covariate.initialize(RAC);
}
@Test(enabled = true)
public void testSimpleCycles() {
short readLength = 10;
byte[] quals = ReadUtils.createRandomReadQuals(readLength);
byte[] bbases = ReadUtils.createRandomReadBases(readLength, true);
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
short readLength = 10;
GATKSAMRecord read = ReadUtils.createRandomRead(readLength);
read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID"));
read.getReadGroup().setPlatform("illumina");

View File

@ -0,0 +1,67 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.util.BitSet;
/**
* @author Mauricio Carneiro
* @since 3/1/12
*/
public class ReadGroupCovariateUnitTest {
ReadGroupCovariate covariate;
RecalibrationArgumentCollection RAC;
@BeforeClass
public void init() {
RAC = new RecalibrationArgumentCollection();
covariate = new ReadGroupCovariate();
covariate.initialize(RAC);
}
@Test(enabled = true)
public void testSingleRecord() {
final String expected = "SAMPLE.1";
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID");
rg.setSample("SAMPLE");
rg.setLane("1");
runTest(rg, expected);
}
@Test(enabled = true)
public void testMissingLane() {
final String expected = "SAMPLE.7";
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.7");
rg.setSample("SAMPLE");
runTest(rg, expected);
}
@Test(enabled = true)
public void testMissingSample() {
final String expected = "MY.ID";
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID");
rg.setLane("1");
runTest(rg, expected);
}
private void runTest(GATKSAMReadGroupRecord rg, String expected) {
GATKSAMRecord read = ReadUtils.createRandomRead(10);
read.setReadGroup(rg);
CovariateValues values = covariate.getValues(read);
verifyCovariateArray(values.getMismatches(), expected);
}
private void verifyCovariateArray(BitSet[] values, String expected) {
for (BitSet value : values) {
String actual = covariate.keyFromBitSet(value);
Assert.assertEquals(actual, expected);
}
}
}