Updated read group covariate to use sample.lane instead of the id
Added Unit test.
This commit is contained in:
parent
539da9e3e1
commit
f421062b55
|
|
@ -1,11 +1,13 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.BitSetUtils;
|
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||||
|
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.BitSet;
|
import java.util.BitSet;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2009 The Broad Institute
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
|
@ -46,6 +48,10 @@ public class ReadGroupCovariate implements RequiredCovariate {
|
||||||
private final HashMap<Short, String> readGroupReverseLookupTable = new HashMap<Short, String>();
|
private final HashMap<Short, String> readGroupReverseLookupTable = new HashMap<Short, String>();
|
||||||
private short nextId = 0;
|
private short nextId = 0;
|
||||||
|
|
||||||
|
private static final String LANE_TAG = "LN";
|
||||||
|
private static final String SAMPLE_TAG = "SM";
|
||||||
|
|
||||||
|
|
||||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||||
@Override
|
@Override
|
||||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||||
|
|
@ -54,14 +60,13 @@ public class ReadGroupCovariate implements RequiredCovariate {
|
||||||
@Override
|
@Override
|
||||||
public CovariateValues getValues(final GATKSAMRecord read) {
|
public CovariateValues getValues(final GATKSAMRecord read) {
|
||||||
final int l = read.getReadLength();
|
final int l = read.getReadLength();
|
||||||
final String readGroupId = read.getReadGroup().getReadGroupId();
|
final String readGroupId = readGroupValueFromRG(read.getReadGroup());
|
||||||
BitSet rg = bitSetForReadGroup(readGroupId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset
|
BitSet rg = bitSetForReadGroup(readGroupId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset
|
||||||
BitSet[] readGroups = new BitSet[l];
|
BitSet[] readGroups = new BitSet[l];
|
||||||
Arrays.fill(readGroups, rg);
|
Arrays.fill(readGroups, rg);
|
||||||
return new CovariateValues(readGroups, readGroups, readGroups);
|
return new CovariateValues(readGroups, readGroups, readGroups);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Used to get the covariate's value from input csv file during on-the-fly recalibration
|
|
||||||
@Override
|
@Override
|
||||||
public final Object getValue(final String str) {
|
public final Object getValue(final String str) {
|
||||||
return str;
|
return str;
|
||||||
|
|
@ -77,15 +82,15 @@ public class ReadGroupCovariate implements RequiredCovariate {
|
||||||
return bitSetForReadGroup((String) key);
|
return bitSetForReadGroup((String) key);
|
||||||
}
|
}
|
||||||
|
|
||||||
public final String decodeReadGroup(final short id) {
|
|
||||||
return readGroupReverseLookupTable.get(id);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int numberOfBits() {
|
public int numberOfBits() {
|
||||||
return BitSetUtils.numberOfBitsToRepresent(Short.MAX_VALUE);
|
return BitSetUtils.numberOfBitsToRepresent(Short.MAX_VALUE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String decodeReadGroup(final short id) {
|
||||||
|
return readGroupReverseLookupTable.get(id);
|
||||||
|
}
|
||||||
|
|
||||||
private BitSet bitSetForReadGroup(String readGroupId) {
|
private BitSet bitSetForReadGroup(String readGroupId) {
|
||||||
short shortId;
|
short shortId;
|
||||||
if (readGroupLookupTable.containsKey(readGroupId))
|
if (readGroupLookupTable.containsKey(readGroupId))
|
||||||
|
|
@ -98,6 +103,35 @@ public class ReadGroupCovariate implements RequiredCovariate {
|
||||||
}
|
}
|
||||||
return BitSetUtils.bitSetFrom(shortId);
|
return BitSetUtils.bitSetFrom(shortId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gather the sample and lane information from the read group record and return sample.lane
|
||||||
|
*
|
||||||
|
* If the bam file is missing the lane information, it tries to use the id regex standardized
|
||||||
|
* by the Broad Institute to extract the lane information
|
||||||
|
*
|
||||||
|
* If it fails to find either of the two pieces of information, will return the read group id instead.
|
||||||
|
*
|
||||||
|
* @param rg the read group record
|
||||||
|
* @return sample.lane or id if information is missing.
|
||||||
|
*/
|
||||||
|
private String readGroupValueFromRG(GATKSAMReadGroupRecord rg) {
|
||||||
|
String lane = rg.getLane(); // take the sample's lane from the read group lane tag
|
||||||
|
String sample = rg.getSample(); // take the sample's name from the read group sample tag
|
||||||
|
String value = rg.getId(); // initialize the return value with the read group ID in case we can't find the sample or the lane
|
||||||
|
|
||||||
|
if (lane == null) { // if this bam doesn't have the lane annotation in the read group try to take it from the read group id
|
||||||
|
String [] splitID = rg.getId().split(Pattern.quote("."));
|
||||||
|
if (splitID.length > 1) // if the id doesn't follow the BROAD defined regex (PU.LANE), fall back to the read group id
|
||||||
|
lane = splitID[splitID.length - 1]; // take the lane from the readgroup id
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sample != null && lane != null)
|
||||||
|
value = sample + "." + lane; // the read group covariate is sample.lane (where the inforamtion is available)
|
||||||
|
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,8 @@ import org.broadinstitute.sting.utils.NGSPlatform;
|
||||||
*/
|
*/
|
||||||
public class GATKSAMReadGroupRecord extends SAMReadGroupRecord {
|
public class GATKSAMReadGroupRecord extends SAMReadGroupRecord {
|
||||||
|
|
||||||
|
// Attribute key under which the lane is stored on the read group (used by getLane/setLane).
// Declared final so the shared constant cannot be reassigned at runtime.
public static final String LANE_TAG = "LN";
|
||||||
|
|
||||||
// the SAMReadGroupRecord data we're caching
|
// the SAMReadGroupRecord data we're caching
|
||||||
private String mSample = null;
|
private String mSample = null;
|
||||||
private String mPlatform = null;
|
private String mPlatform = null;
|
||||||
|
|
@ -79,4 +81,12 @@ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord {
|
||||||
|
|
||||||
return mNGSPlatform;
|
return mNGSPlatform;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getLane() {
|
||||||
|
return this.getAttribute(LANE_TAG);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLane(String lane) {
|
||||||
|
this.setAttribute(LANE_TAG, lane);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -698,6 +698,13 @@ public class ReadUtils {
|
||||||
return bases;
|
return bases;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static GATKSAMRecord createRandomRead(int length) {
|
||||||
|
byte[] quals = ReadUtils.createRandomReadQuals(length);
|
||||||
|
byte[] bbases = ReadUtils.createRandomReadBases(length, true);
|
||||||
|
return ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) {
|
public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) {
|
||||||
String[] sequenceRecordNames = new String[sequenceDictionary.size()];
|
String[] sequenceRecordNames = new String[sequenceDictionary.size()];
|
||||||
int sequenceRecordIndex = 0;
|
int sequenceRecordIndex = 0;
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,7 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
|
||||||
import org.broadinstitute.sting.utils.clipping.ClippingRepresentation;
|
import org.broadinstitute.sting.utils.clipping.ClippingRepresentation;
|
||||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||||
import org.testng.Assert;
|
import org.testng.Assert;
|
||||||
|
|
@ -11,7 +9,6 @@ import org.testng.annotations.BeforeClass;
|
||||||
import org.testng.annotations.Test;
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
import java.util.BitSet;
|
import java.util.BitSet;
|
||||||
import java.util.Random;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author Mauricio Carneiro
|
* @author Mauricio Carneiro
|
||||||
|
|
@ -20,22 +17,18 @@ import java.util.Random;
|
||||||
public class ContextCovariateUnitTest {
|
public class ContextCovariateUnitTest {
|
||||||
ContextCovariate covariate;
|
ContextCovariate covariate;
|
||||||
RecalibrationArgumentCollection RAC;
|
RecalibrationArgumentCollection RAC;
|
||||||
Random random;
|
|
||||||
|
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
public void init() {
|
public void init() {
|
||||||
RAC = new RecalibrationArgumentCollection();
|
RAC = new RecalibrationArgumentCollection();
|
||||||
covariate = new ContextCovariate();
|
covariate = new ContextCovariate();
|
||||||
random = GenomeAnalysisEngine.getRandomGenerator();
|
|
||||||
covariate.initialize(RAC);
|
covariate.initialize(RAC);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(enabled = true)
|
@Test(enabled = true)
|
||||||
public void testSimpleContexts() {
|
public void testSimpleContexts() {
|
||||||
byte[] quals = ReadUtils.createRandomReadQuals(10000);
|
GATKSAMRecord read = ReadUtils.createRandomRead(1000);
|
||||||
byte[] bbases = ReadUtils.createRandomReadBases(10000, true);
|
|
||||||
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
|
|
||||||
GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS);
|
GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS);
|
||||||
CovariateValues values = covariate.getValues(read);
|
CovariateValues values = covariate.getValues(read);
|
||||||
verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));
|
verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,5 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
|
||||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||||
|
|
@ -10,7 +8,6 @@ import org.testng.annotations.BeforeClass;
|
||||||
import org.testng.annotations.Test;
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
import java.util.BitSet;
|
import java.util.BitSet;
|
||||||
import java.util.Random;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author Mauricio Carneiro
|
* @author Mauricio Carneiro
|
||||||
|
|
@ -19,22 +16,18 @@ import java.util.Random;
|
||||||
public class CycleCovariateUnitTest {
|
public class CycleCovariateUnitTest {
|
||||||
CycleCovariate covariate;
|
CycleCovariate covariate;
|
||||||
RecalibrationArgumentCollection RAC;
|
RecalibrationArgumentCollection RAC;
|
||||||
Random random;
|
|
||||||
|
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
public void init() {
|
public void init() {
|
||||||
RAC = new RecalibrationArgumentCollection();
|
RAC = new RecalibrationArgumentCollection();
|
||||||
covariate = new CycleCovariate();
|
covariate = new CycleCovariate();
|
||||||
random = GenomeAnalysisEngine.getRandomGenerator();
|
|
||||||
covariate.initialize(RAC);
|
covariate.initialize(RAC);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(enabled = true)
|
@Test(enabled = true)
|
||||||
public void testSimpleCycles() {
|
public void testSimpleCycles() {
|
||||||
short readLength = 10;
|
short readLength = 10;
|
||||||
byte[] quals = ReadUtils.createRandomReadQuals(readLength);
|
GATKSAMRecord read = ReadUtils.createRandomRead(readLength);
|
||||||
byte[] bbases = ReadUtils.createRandomReadBases(readLength, true);
|
|
||||||
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
|
|
||||||
read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID"));
|
read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID"));
|
||||||
read.getReadGroup().setPlatform("illumina");
|
read.getReadGroup().setPlatform("illumina");
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,67 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||||
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||||
|
import org.testng.Assert;
|
||||||
|
import org.testng.annotations.BeforeClass;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.util.BitSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author Mauricio Carneiro
|
||||||
|
* @since 3/1/12
|
||||||
|
*/
|
||||||
|
public class ReadGroupCovariateUnitTest {
|
||||||
|
ReadGroupCovariate covariate;
|
||||||
|
RecalibrationArgumentCollection RAC;
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public void init() {
|
||||||
|
RAC = new RecalibrationArgumentCollection();
|
||||||
|
covariate = new ReadGroupCovariate();
|
||||||
|
covariate.initialize(RAC);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(enabled = true)
|
||||||
|
public void testSingleRecord() {
|
||||||
|
final String expected = "SAMPLE.1";
|
||||||
|
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID");
|
||||||
|
rg.setSample("SAMPLE");
|
||||||
|
rg.setLane("1");
|
||||||
|
runTest(rg, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(enabled = true)
|
||||||
|
public void testMissingLane() {
|
||||||
|
final String expected = "SAMPLE.7";
|
||||||
|
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.7");
|
||||||
|
rg.setSample("SAMPLE");
|
||||||
|
runTest(rg, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(enabled = true)
|
||||||
|
public void testMissingSample() {
|
||||||
|
final String expected = "MY.ID";
|
||||||
|
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID");
|
||||||
|
rg.setLane("1");
|
||||||
|
runTest(rg, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void runTest(GATKSAMReadGroupRecord rg, String expected) {
|
||||||
|
GATKSAMRecord read = ReadUtils.createRandomRead(10);
|
||||||
|
read.setReadGroup(rg);
|
||||||
|
CovariateValues values = covariate.getValues(read);
|
||||||
|
verifyCovariateArray(values.getMismatches(), expected);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifyCovariateArray(BitSet[] values, String expected) {
|
||||||
|
for (BitSet value : values) {
|
||||||
|
String actual = covariate.keyFromBitSet(value);
|
||||||
|
Assert.assertEquals(actual, expected);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue