Support for original quality scores OQ flag. pQ flag in TableRecalibation to preserve quality scores below a threshold (defaulting to 5)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1474 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-08-28 14:14:21 +00:00
parent f0179109fa
commit 8e129d76fd
6 changed files with 134 additions and 100 deletions

View File

@ -109,13 +109,13 @@ public class CovariateCounter {
* @param ref
* @return
*/
public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) {
public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref, boolean useOriginalQuals ) {
if ( offset == 0 )
throw new RuntimeException("Illegal read offset " + offset + " in read " + read.getReadName());
int cycle = offset;
byte[] bases = read.getReadBases();
byte[] quals = read.getBaseQualities();
byte[] quals = RecalDataManager.getQualsForRecalibration(read, useOriginalQuals);
char base = (char)bases[offset];
char prevBase = (char)bases[offset - 1];

View File

@ -41,6 +41,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
//@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
public boolean collapseDinuc = false;
@Argument(fullName = "useOriginalQuals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
public boolean useOriginalQuals = false;
private CovariateCounter covariateCounter = null;
private long counted_sites = 0; // number of sites used to count covariates
@ -109,7 +113,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
if ((read.getMappingQuality() >= MIN_MAPPING_QUALITY && isSupportedReadGroup(readGroup) )) {
int offset = offsets.get(i);
if ( offset > 0 && offset < (read.getReadLength() - 1) ) { // skip first and last bases because they suck and they don't have a dinuc count
counted_bases += covariateCounter.updateDataFromRead(readGroupString, read, offset, ref.getBase());
counted_bases += covariateCounter.updateDataFromRead(readGroupString, read, offset, ref.getBase(), useOriginalQuals);
}
}
}
@ -235,33 +239,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
out.printf("# fraction_skipped 1 / %.0f bp%n", (double)counted_sites / skipped_sites);
}
@Deprecated
private void writeLogisticRecalibrationTable() {
PrintStream dinuc_out = null;
try {
dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.csv");
dinuc_out.println("rg,dn,logitQ,pos,indicator,count");
for (String readGroup : covariateCounter.getReadGroups()) {
for ( int dinuc_index=0; dinuc_index<RecalData.NDINUCS; dinuc_index++) {
for ( RecalData datum: covariateCounter.getRecalData(readGroup) ) {
if ( RecalData.dinucIndex(datum.dinuc) == dinuc_index ) {
if ((datum.N - datum.B) > 0)
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup, RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B);
if (datum.B > 0)
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup, RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 1, datum.B);
}
}
}
}
}
catch (FileNotFoundException e) {
System.err.println("FileNotFoundException: " + e.getMessage());
}
finally {
if (dinuc_out != null) dinuc_out.close();
}
}
/**
* Writes out the key recalibration data collected from the reads. Dumps this recalibration data
* as a CVS string to the recalTableOut PrintStream. Emits the data for all read groups into this file.

View File

@ -1,9 +1,12 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.ExpandingArrayList;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.*;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
* User: mdepristo
@ -12,6 +15,9 @@ import java.util.*;
* To change this template use File | Settings | File Templates.
*/
public class RecalDataManager {
/** The original quality scores are stored in this attribute */
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ";
ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
boolean trackPos, trackDinuc;
String readGroup;
@ -114,10 +120,6 @@ public class RecalDataManager {
return d3 == null ? null : d3.get(dinucIndex);
}
//private RecalData get(int posIndex, int qual, int dinucIndex) {
// return data[posIndex][qual][dinucIndex];
//}
private void set(int posIndex, int qual, int dinucIndex, RecalData datum) {
// grow data if necessary
ExpandingArrayList<ExpandingArrayList<RecalData>> d2 = data.get(posIndex);
@ -137,31 +139,6 @@ public class RecalDataManager {
d3.set(dinucIndex, datum);
}
//private void set(int posIndex, int qual, int dinucIndex, RecalData datum) {
// data[posIndex][qual][dinucIndex] = datum;
//}
/* public List<RecalData> select(int pos, int qual, int dinuc_index ) {
List<RecalData> l = new LinkedList<RecalData>();
for ( int i = 0; i < data.length; i++ ) {
if ( i == pos || pos == -1 || ! trackPos ) {
for ( int j = 0; j < data[i].length; j++ ) {
if ( j == qual || qual == -1 ) {
for ( int k = 0; k < data[i][j].length; k++ ) {
if ( k == dinuc_index|| dinuc_index == -1 || ! trackDinuc ) {
l.add(data[i][j][k]);
}
}
}
}
}
}
return l;
}
*/
private RecalData getMatchingDatum(List<RecalData> l, RecalData datum,
boolean combinePos, boolean combineQual, boolean combineDinuc) {
for ( RecalData find : l ) {
@ -199,5 +176,20 @@ public class RecalDataManager {
public List<RecalData> getAll() {
return flattenData;
}
// we can process the OQ field if requested
public static byte[] getQualsForRecalibration( SAMRecord read, boolean useOriginalQuals ) {
byte[] quals = read.getBaseQualities();
if ( useOriginalQuals && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG);
if ( obj instanceof String )
quals = QualityUtils.fastqToPhred((String)obj);
else {
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
}
}
return quals;
}
}

View File

@ -39,6 +39,7 @@ import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.File;
import java.io.FileNotFoundException;
import java.lang.reflect.Method;
@WalkerName("TableRecalibration")
@Requires({DataSource.READS, DataSource.REFERENCE})
@ -52,14 +53,17 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
@Argument(shortName="rawQempirical", doc="If provided, we will use raw Qempirical scores calculated from the # mismatches and # bases, rather than the more conservative estimate of # mismatches + 1 / # bases + 1", required=false)
public boolean rawQempirical = false;
@Argument(shortName="adjustQ0Bases", doc="If provided, Q0 bases will have their quality scores modified, otherwise they will be left as Q0 in the output", required=false)
public boolean adjustQ0Bases = false;
@Argument(fullName="preserveQScoresLessThan", shortName="pQ", doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
public int preserveQScoresLessThan = 5;
@Argument(fullName = "useOriginalQuals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
public boolean useOriginalQuals = false;
//
// Basic static information
//
private static Logger logger = Logger.getLogger(TableRecalibrationWalker.class);
private static String VERSION = "0.2.3";
private final static boolean DEBUG = false;
private static String VERSION = "0.2.4";
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>();
@ -82,10 +86,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private static Pattern COLLAPSED_DINUC_PATTERN = Pattern.compile("^#\\s+collapsed_dinuc\\s+(\\w+)");
private static Pattern HEADER_PATTERN = Pattern.compile("^rg.*");
//private static boolean DEBUG_ME = true;
public void initialize() {
logger.info("TableRecalibrator version: " + VERSION);
//
// crappy hack until Enum arg types are supported
//
@ -233,7 +236,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
public SAMRecord map(char[] ref, SAMRecord read) {
byte[] bases = read.getReadBases();
byte[] quals = read.getBaseQualities();
byte[] quals = RecalDataManager.getQualsForRecalibration(read, useOriginalQuals);
// Since we want machine direction reads not corrected positive strand reads, rev comp any negative strand reads
if (read.getReadNegativeStrandFlag()) {
@ -244,33 +249,30 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
try {
byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals);
//if ( read.getReadName().equals("IL12_395:7:215:171:693") ) {
// for ( int i = 0; i < quals.length; i++ ) {
// System.out.printf("READ found: %s is now %s%n", quals[i], recalQuals[i]);
// }
//}
// Don't change Q scores below some threshold
preserveQScores(quals, recalQuals, read);
if (read.getReadNegativeStrandFlag()) // reverse the quals for the neg strand read
if (read.getReadNegativeStrandFlag()) { // reverse the quals for the neg strand read
recalQuals = BaseUtils.reverse(recalQuals);
// special case the first and last bases in SOLID reads, which are always 0
// We actually just never change Q0 bases
preserveQ0Bases(quals, recalQuals, read);
quals = BaseUtils.reverse(quals);
}
read.setBaseQualities(recalQuals);
if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) {
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(quals));
}
return read;
} catch ( StingException e ) {
throw new RuntimeException(String.format("Bug found while processing read %s: %s", read.format(), e.getMessage()));
}
}
private void preserveQ0Bases( byte[] originalQuals, byte[] recalQuals, SAMRecord read ) {
if ( ! adjustQ0Bases ) {
for ( int i = 0; i < recalQuals.length; i++ ) {
//System.out.printf("Original qual %d => %d%n", originalQuals[i], recalQuals[i]);
if ( originalQuals[i] == 0 ) {
//System.out.printf("Preserving Q0 base at %d in read %s%n", i, read.getReadName());
recalQuals[i] = 0;
}
private void preserveQScores( byte[] originalQuals, byte[] recalQuals, SAMRecord read ) {
for ( int i = 0; i < recalQuals.length; i++ ) {
if ( originalQuals[i] < preserveQScoresLessThan ) {
System.out.printf("Preserving Q%d base at %d in read %s%n", originalQuals[i], i, read.getReadName());
recalQuals[i] = originalQuals[i];
}
}
}
@ -528,12 +530,6 @@ class SerialRecalMapping implements RecalMapping {
readGroup, prevBase, base, cycle, qual, qual, globalDeltaQ, deltaQPos, deltaQDinuc, newQualByte));
}
//if ( printStateP(pos, RecalData.dinucIndex2bases(index), qual) )
// System.out.printf("%s %c%c %d %d => %d + %.2f + %.2f + %.2f + %.2f = %d%n",
// readGroup, prevBase, base, cycle, qual,
// qual, globalDeltaQ, deltaQual, deltaQPos, deltaQDinuc,
// newQualByte);
return newQualByte;
}
}

View File

@ -206,4 +206,73 @@ public class QualityUtils {
return rcCompressedQuals;
}
// TODO --
// TODO --
// TODO -- stolen from SAMUtils in picard -- remove when public access is granted
// TODO --
// TODO --
/**
* Convert an array of bytes, in which each byte is a binary phred quality score, to
* printable ASCII representation of the quality scores, ala FASTQ format.
* @param buffer Array of bytes in which each byte is a binar phred score.
* @param offset Where in buffer to start conversion.
* @param length How many bytes of buffer to convert.
* @return String with ASCII representation of those quality scores.
*/
public static String phredToFastq(final byte[] buffer, final int offset, final int length) {
final char[] chars = new char[length];
for (int i = 0; i < length; i++) {
chars[i] = phredToFastq(buffer[offset+i] & 0xFF);
}
return new String(chars);
}
/** convenience -- should be pushed back into Picard */
public static String phredToFastq(final byte[] buffer) {
return phredToFastq(buffer, 0, buffer.length);
}
/**
* Convert a single binary phred score to printable ASCII representation, ala FASTQ format.
* @param phredScore binary phred score.
* @return Printable ASCII representation of phred score.
*/
public static char phredToFastq(final int phredScore) {
if (phredScore < 0 || phredScore > 63) {
throw new IllegalArgumentException("Cannot encode phred score: " + phredScore);
}
return (char) (33 + phredScore);
}
/**
* Convert a string with phred scores in printable ASCII FASTQ format to an array
* of binary phred scores.
* @param fastq Phred scores in FASTQ printable ASCII format.
* @return byte array of binary phred scores in which each byte corresponds to a character in the input string.
*/
public static byte[] fastqToPhred(final String fastq) {
if (fastq == null) {
return null;
}
final int length = fastq.length();
final byte[] scores = new byte[length];
for (int i = 0; i < length; i++) {
scores[i] = (byte) fastqToPhred(fastq.charAt(i));
}
return scores;
}
/**
* Convert a single printable ASCII FASTQ format phred score to binary phred score.
* @param ch Printable ASCII FASTQ format phred score.
* @return Binary phred score.
*/
public static int fastqToPhred(final char ch) {
if (ch < 33 || ch > 126) {
throw new IllegalArgumentException("Invalid fastq character: " + ch);
}
return (ch - 33);
}
}

View File

@ -93,7 +93,7 @@ public class CovariateCounterTest extends BaseTest {
@Test
public void testOneRead() {
for ( int i = 1; i < read1.getReadBases().length; i++ )
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
c.printState();
Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0);
@ -112,9 +112,9 @@ public class CovariateCounterTest extends BaseTest {
@Test
public void testTwoReads() {
for ( int i = 1; i < read1.getReadBases().length; i++ )
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
for ( int i = 1; i < read2.getReadBases().length; i++ )
c.updateDataFromRead(readGroup2, read2, i, (char)read2.getReadBases()[i]);
c.updateDataFromRead(readGroup2, read2, i, (char)read2.getReadBases()[i], false);
c.printState();
Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0);
@ -133,9 +133,9 @@ public class CovariateCounterTest extends BaseTest {
@Test
public void testTwoReadsSameGroup() {
for ( int i = 1; i < read1.getReadBases().length; i++ )
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
for ( int i = 1; i < read2.getReadBases().length; i++ )
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
c.printState();
for ( int i = 1; i < bases1.length; i++ ) {
@ -153,9 +153,9 @@ public class CovariateCounterTest extends BaseTest {
@Test
public void testTwoReadsSameGroupNotIdentical() {
for ( int i = 1; i < read1.getReadBases().length; i++ )
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
for ( int i = 1; i < read3.getReadBases().length; i++ )
c.updateDataFromRead(readGroup1, read3, i, (char)read3.getReadBases()[i]);
c.updateDataFromRead(readGroup1, read3, i, (char)read3.getReadBases()[i], false);
c.printState();
for ( int i = 1; i < bases1.length; i++ ) {
@ -177,6 +177,6 @@ public class CovariateCounterTest extends BaseTest {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases, quals);
c.updateDataFromRead(readGroup1, read, 0, (char)bases[0]);
c.updateDataFromRead(readGroup1, read, 0, (char)bases[0], false);
}
}