Updated read group covariate to use sample.lane instead of the id
Added Unit test.
This commit is contained in:
parent
539da9e3e1
commit
f421062b55
|
|
@ -1,11 +1,13 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.BitSet;
|
||||
import java.util.HashMap;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@ -45,6 +47,10 @@ public class ReadGroupCovariate implements RequiredCovariate {
|
|||
private final HashMap<String, Short> readGroupLookupTable = new HashMap<String, Short>();
|
||||
private final HashMap<Short, String> readGroupReverseLookupTable = new HashMap<Short, String>();
|
||||
private short nextId = 0;
|
||||
|
||||
private static final String LANE_TAG = "LN";
|
||||
private static final String SAMPLE_TAG = "SM";
|
||||
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
@Override
|
||||
|
|
@ -54,14 +60,13 @@ public class ReadGroupCovariate implements RequiredCovariate {
|
|||
@Override
|
||||
public CovariateValues getValues(final GATKSAMRecord read) {
|
||||
final int l = read.getReadLength();
|
||||
final String readGroupId = read.getReadGroup().getReadGroupId();
|
||||
BitSet rg = bitSetForReadGroup(readGroupId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset
|
||||
final String readGroupId = readGroupValueFromRG(read.getReadGroup());
|
||||
BitSet rg = bitSetForReadGroup(readGroupId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset
|
||||
BitSet[] readGroups = new BitSet[l];
|
||||
Arrays.fill(readGroups, rg);
|
||||
return new CovariateValues(readGroups, readGroups, readGroups);
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file during on-the-fly recalibration
|
||||
@Override
|
||||
public final Object getValue(final String str) {
|
||||
return str;
|
||||
|
|
@ -77,15 +82,15 @@ public class ReadGroupCovariate implements RequiredCovariate {
|
|||
return bitSetForReadGroup((String) key);
|
||||
}
|
||||
|
||||
public final String decodeReadGroup(final short id) {
|
||||
return readGroupReverseLookupTable.get(id);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int numberOfBits() {
|
||||
return BitSetUtils.numberOfBitsToRepresent(Short.MAX_VALUE);
|
||||
}
|
||||
|
||||
|
||||
private String decodeReadGroup(final short id) {
|
||||
return readGroupReverseLookupTable.get(id);
|
||||
}
|
||||
|
||||
private BitSet bitSetForReadGroup(String readGroupId) {
|
||||
short shortId;
|
||||
if (readGroupLookupTable.containsKey(readGroupId))
|
||||
|
|
@ -98,6 +103,35 @@ public class ReadGroupCovariate implements RequiredCovariate {
|
|||
}
|
||||
return BitSetUtils.bitSetFrom(shortId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gather the sample and lane information from the read group record and return sample.lane
|
||||
*
|
||||
* If the bam file is missing the lane information, it tries to use the id regex standardized
|
||||
* by the Broad Institute to extract the lane information
|
||||
*
|
||||
* If it fails to find either of the two pieces of information, will return the read group id instead.
|
||||
*
|
||||
* @param rg the read group record
|
||||
* @return sample.lane or id if information is missing.
|
||||
*/
|
||||
private String readGroupValueFromRG(GATKSAMReadGroupRecord rg) {
|
||||
String lane = rg.getLane(); // take the sample's lane from the read group lane tag
|
||||
String sample = rg.getSample(); // take the sample's name from the read group sample tag
|
||||
String value = rg.getId(); // initialize the return value with the read group ID in case we can't find the sample or the lane
|
||||
|
||||
if (lane == null) { // if this bam doesn't have the lane annotation in the read group try to take it from the read group id
|
||||
String [] splitID = rg.getId().split(Pattern.quote("."));
|
||||
if (splitID.length > 1) // if the id doesn't follow the BROAD defined regex (PU.LANE), fall back to the read group id
|
||||
lane = splitID[splitID.length - 1]; // take the lane from the readgroup id
|
||||
}
|
||||
|
||||
if (sample != null && lane != null)
|
||||
value = sample + "." + lane; // the read group covariate is sample.lane (where the inforamtion is available)
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -13,6 +13,8 @@ import org.broadinstitute.sting.utils.NGSPlatform;
|
|||
*/
|
||||
public class GATKSAMReadGroupRecord extends SAMReadGroupRecord {
|
||||
|
||||
public static String LANE_TAG = "LN";
|
||||
|
||||
// the SAMReadGroupRecord data we're caching
|
||||
private String mSample = null;
|
||||
private String mPlatform = null;
|
||||
|
|
@ -79,4 +81,12 @@ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord {
|
|||
|
||||
return mNGSPlatform;
|
||||
}
|
||||
|
||||
public String getLane() {
|
||||
return this.getAttribute(LANE_TAG);
|
||||
}
|
||||
|
||||
public void setLane(String lane) {
|
||||
this.setAttribute(LANE_TAG, lane);
|
||||
}
|
||||
}
|
||||
|
|
@ -698,6 +698,13 @@ public class ReadUtils {
|
|||
return bases;
|
||||
}
|
||||
|
||||
public static GATKSAMRecord createRandomRead(int length) {
|
||||
byte[] quals = ReadUtils.createRandomReadQuals(length);
|
||||
byte[] bbases = ReadUtils.createRandomReadBases(length, true);
|
||||
return ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
|
||||
}
|
||||
|
||||
|
||||
public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) {
|
||||
String[] sequenceRecordNames = new String[sequenceDictionary.size()];
|
||||
int sequenceRecordIndex = 0;
|
||||
|
|
|
|||
|
|
@ -1,9 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.clipping.ClippingRepresentation;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.testng.Assert;
|
||||
|
|
@ -11,7 +9,6 @@ import org.testng.annotations.BeforeClass;
|
|||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* @author Mauricio Carneiro
|
||||
|
|
@ -20,22 +17,18 @@ import java.util.Random;
|
|||
public class ContextCovariateUnitTest {
|
||||
ContextCovariate covariate;
|
||||
RecalibrationArgumentCollection RAC;
|
||||
Random random;
|
||||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
RAC = new RecalibrationArgumentCollection();
|
||||
covariate = new ContextCovariate();
|
||||
random = GenomeAnalysisEngine.getRandomGenerator();
|
||||
covariate.initialize(RAC);
|
||||
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testSimpleContexts() {
|
||||
byte[] quals = ReadUtils.createRandomReadQuals(10000);
|
||||
byte[] bbases = ReadUtils.createRandomReadBases(10000, true);
|
||||
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
|
||||
GATKSAMRecord read = ReadUtils.createRandomRead(1000);
|
||||
GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS);
|
||||
CovariateValues values = covariate.getValues(read);
|
||||
verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));
|
||||
|
|
|
|||
|
|
@ -1,7 +1,5 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
|
@ -10,7 +8,6 @@ import org.testng.annotations.BeforeClass;
|
|||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* @author Mauricio Carneiro
|
||||
|
|
@ -19,22 +16,18 @@ import java.util.Random;
|
|||
public class CycleCovariateUnitTest {
|
||||
CycleCovariate covariate;
|
||||
RecalibrationArgumentCollection RAC;
|
||||
Random random;
|
||||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
RAC = new RecalibrationArgumentCollection();
|
||||
covariate = new CycleCovariate();
|
||||
random = GenomeAnalysisEngine.getRandomGenerator();
|
||||
covariate.initialize(RAC);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testSimpleCycles() {
|
||||
short readLength = 10;
|
||||
byte[] quals = ReadUtils.createRandomReadQuals(readLength);
|
||||
byte[] bbases = ReadUtils.createRandomReadBases(readLength, true);
|
||||
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
|
||||
short readLength = 10;
|
||||
GATKSAMRecord read = ReadUtils.createRandomRead(readLength);
|
||||
read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID"));
|
||||
read.getReadGroup().setPlatform("illumina");
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,67 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.BitSet;
|
||||
|
||||
/**
|
||||
* @author Mauricio Carneiro
|
||||
* @since 3/1/12
|
||||
*/
|
||||
public class ReadGroupCovariateUnitTest {
|
||||
ReadGroupCovariate covariate;
|
||||
RecalibrationArgumentCollection RAC;
|
||||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
RAC = new RecalibrationArgumentCollection();
|
||||
covariate = new ReadGroupCovariate();
|
||||
covariate.initialize(RAC);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testSingleRecord() {
|
||||
final String expected = "SAMPLE.1";
|
||||
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID");
|
||||
rg.setSample("SAMPLE");
|
||||
rg.setLane("1");
|
||||
runTest(rg, expected);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMissingLane() {
|
||||
final String expected = "SAMPLE.7";
|
||||
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.7");
|
||||
rg.setSample("SAMPLE");
|
||||
runTest(rg, expected);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMissingSample() {
|
||||
final String expected = "MY.ID";
|
||||
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID");
|
||||
rg.setLane("1");
|
||||
runTest(rg, expected);
|
||||
}
|
||||
|
||||
private void runTest(GATKSAMReadGroupRecord rg, String expected) {
|
||||
GATKSAMRecord read = ReadUtils.createRandomRead(10);
|
||||
read.setReadGroup(rg);
|
||||
CovariateValues values = covariate.getValues(read);
|
||||
verifyCovariateArray(values.getMismatches(), expected);
|
||||
|
||||
}
|
||||
|
||||
private void verifyCovariateArray(BitSet[] values, String expected) {
|
||||
for (BitSet value : values) {
|
||||
String actual = covariate.keyFromBitSet(value);
|
||||
Assert.assertEquals(actual, expected);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue