Added - PairedQualityScoreCountsWalker: counts quality scores (e.g. as a histogram) on first reads of a pair and second reads of a pair. Turns out there's a consistent difference in quality scores; even after recalibrating without the pair ordering as a covariate (there's a bit of averaging -- but not as much as I initially thought).

Added - A paired read order covariate to use with recalibration. Currently experimental: for instance, what's a proper pair versus just a pair? Nobody should use this one...



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2401 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2009-12-18 15:01:01 +00:00
parent 4f59bfd513
commit 7b5e332ff3
2 changed files with 135 additions and 0 deletions

View File

@ -0,0 +1,28 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Dec 16, 2009
* Time: 3:22:19 PM
* To change this template use File | Settings | File Templates.
*/
public class PairedReadOrderCovariate implements ExperimentalCovariate{
public void initialize (final RecalibrationArgumentCollection rac ) { /* do nothing */ }
public final Comparable getValue(final SAMRecord read, final int offset) {
return read.getReadPairedFlag() ? "Not_Paired" : read.getMateUnmappedFlag() ? "Mate_Unmapped" : read.getFirstOfPairFlag() ? "First_Read" : "Second_Read";
}
public final Comparable getValue( final String str ) {
return Integer.parseInt( str );
}
// Used to estimate the amount space required for the full data HashMap
public final int estimatedNumberOfBins() {
return 4;
}
}

View File

@ -0,0 +1,107 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import net.sf.samtools.SAMRecord;
/**
* This walker prints out quality score counts for first and second reads of a pair aggregated over all reads
* in the interval.
*
* @Author: Chris Hartl
*/
public class PairedQualityScoreCountsWalker extends ReadWalker<Pair<byte[],Boolean>,Pair<CycleQualCounts,CycleQualCounts>> {
@Argument(fullName="readLength", shortName="rl", doc="Length of reads in the bam file", required=true)
public int readLength = -1;
public void initialize() { return; }
public Pair<CycleQualCounts,CycleQualCounts> reduceInit() {
return new Pair<CycleQualCounts,CycleQualCounts>( new CycleQualCounts(readLength), new CycleQualCounts(readLength) );
}
public Pair<CycleQualCounts,CycleQualCounts> reduce( Pair<byte[],Boolean> mapCounts, Pair<CycleQualCounts,CycleQualCounts> reduceCounts ) {
if ( mapCounts != null ) {
if ( mapCounts.second ) {
reduceCounts.first.update(mapCounts.first);
} else {
reduceCounts.second.update(mapCounts.first);
}
}
return reduceCounts;
}
public Pair<byte[],Boolean> map( char[] ref, SAMRecord read) {
if ( canUseRead(read) ) {
return getCorrectlyOrientedBaseQualities(read);
} else {
return null;
}
}
private boolean canUseRead(SAMRecord read) {
return ( ! read.getMateUnmappedFlag() && ! read.getReadUnmappedFlag() ) && ( read.getReadPairedFlag() && read.getReadLength() == readLength );
}
private Pair<byte[],Boolean> getCorrectlyOrientedBaseQualities(SAMRecord read) {
byte[] quals = read.getReadNegativeStrandFlag() ? BaseUtils.reverse(read.getBaseQualities()) : read.getBaseQualities();
return new Pair<byte[], Boolean>(quals, read.getFirstOfPairFlag());
}
public void onTraversalDone(Pair<CycleQualCounts,CycleQualCounts> finalCounts) {
StringBuilder output = new StringBuilder();
output.append(String.format("%s\t%s\t%s%n","Cycle","First_read_counts","Second_read_counts"));
for ( int offset = 0; offset < readLength; offset++ ) {
output.append(String.format("%d\t%s\t%s%n",offset,finalCounts.first.getCountDistribution(offset),finalCounts.second.getCountDistribution(offset)));
}
out.printf("%s",output.toString());
}
}
class CycleQualCounts {
private long[][] qualityCountsByCycle;
private int cycleLength;
private int qualMax = QualityUtils.MAX_REASONABLE_Q_SCORE + 1;
public CycleQualCounts(int cycleLength) {
this.cycleLength = cycleLength;
qualityCountsByCycle = new long[cycleLength][qualMax];
for ( int cycle = 0; cycle < cycleLength; cycle++ ) {
for ( int qual = 0; qual < qualMax; qual++) {
qualityCountsByCycle[cycle][qual] = 0;
}
}
}
public void update(int offset, byte quality) {
qualityCountsByCycle[offset][qualityToQualityIndex(quality)]++;
}
public void update(byte[] qualArray) {
for ( int o = 0; o < cycleLength; o++ ) {
update(o,qualArray[o]);
}
}
private int qualityToQualityIndex(byte qual) {
return qual < 0 ? 0 : qual > qualMax ? qualMax : qual;
}
public long[][] getCounts() { return qualityCountsByCycle; }
public String getCountDistribution(int offset) {
StringBuilder b = new StringBuilder();
for ( int qual = 0; qual < qualMax-1; qual++ ) {
b.append(String.format("%d;",qualityCountsByCycle[offset][qual]));
}
b.append(String.format("%d",qualityCountsByCycle[offset][qualMax-1]));
return b.toString();
}
}