Support for original quality scores OQ flag. pQ flag in TableRecalibation to preserve quality scores below a threshold (defaulting to 5)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1474 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f0179109fa
commit
8e129d76fd
|
|
@ -109,13 +109,13 @@ public class CovariateCounter {
|
|||
* @param ref
|
||||
* @return
|
||||
*/
|
||||
public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) {
|
||||
public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref, boolean useOriginalQuals ) {
|
||||
if ( offset == 0 )
|
||||
throw new RuntimeException("Illegal read offset " + offset + " in read " + read.getReadName());
|
||||
|
||||
int cycle = offset;
|
||||
byte[] bases = read.getReadBases();
|
||||
byte[] quals = read.getBaseQualities();
|
||||
byte[] quals = RecalDataManager.getQualsForRecalibration(read, useOriginalQuals);
|
||||
|
||||
char base = (char)bases[offset];
|
||||
char prevBase = (char)bases[offset - 1];
|
||||
|
|
|
|||
|
|
@ -41,6 +41,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
//@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
|
||||
public boolean collapseDinuc = false;
|
||||
|
||||
@Argument(fullName = "useOriginalQuals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
public boolean useOriginalQuals = false;
|
||||
|
||||
|
||||
private CovariateCounter covariateCounter = null;
|
||||
|
||||
private long counted_sites = 0; // number of sites used to count covariates
|
||||
|
|
@ -109,7 +113,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
if ((read.getMappingQuality() >= MIN_MAPPING_QUALITY && isSupportedReadGroup(readGroup) )) {
|
||||
int offset = offsets.get(i);
|
||||
if ( offset > 0 && offset < (read.getReadLength() - 1) ) { // skip first and last bases because they suck and they don't have a dinuc count
|
||||
counted_bases += covariateCounter.updateDataFromRead(readGroupString, read, offset, ref.getBase());
|
||||
counted_bases += covariateCounter.updateDataFromRead(readGroupString, read, offset, ref.getBase(), useOriginalQuals);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -235,33 +239,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
out.printf("# fraction_skipped 1 / %.0f bp%n", (double)counted_sites / skipped_sites);
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
private void writeLogisticRecalibrationTable() {
|
||||
PrintStream dinuc_out = null;
|
||||
try {
|
||||
dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.csv");
|
||||
dinuc_out.println("rg,dn,logitQ,pos,indicator,count");
|
||||
for (String readGroup : covariateCounter.getReadGroups()) {
|
||||
for ( int dinuc_index=0; dinuc_index<RecalData.NDINUCS; dinuc_index++) {
|
||||
for ( RecalData datum: covariateCounter.getRecalData(readGroup) ) {
|
||||
if ( RecalData.dinucIndex(datum.dinuc) == dinuc_index ) {
|
||||
if ((datum.N - datum.B) > 0)
|
||||
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup, RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B);
|
||||
if (datum.B > 0)
|
||||
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup, RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 1, datum.B);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (FileNotFoundException e) {
|
||||
System.err.println("FileNotFoundException: " + e.getMessage());
|
||||
}
|
||||
finally {
|
||||
if (dinuc_out != null) dinuc_out.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes out the key recalibration data collected from the reads. Dumps this recalibration data
|
||||
* as a CVS string to the recalTableOut PrintStream. Emits the data for all read groups into this file.
|
||||
|
|
|
|||
|
|
@ -1,9 +1,12 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
|
|
@ -12,6 +15,9 @@ import java.util.*;
|
|||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class RecalDataManager {
|
||||
/** The original quality scores are stored in this attribute */
|
||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ";
|
||||
|
||||
ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
|
||||
boolean trackPos, trackDinuc;
|
||||
String readGroup;
|
||||
|
|
@ -114,10 +120,6 @@ public class RecalDataManager {
|
|||
return d3 == null ? null : d3.get(dinucIndex);
|
||||
}
|
||||
|
||||
//private RecalData get(int posIndex, int qual, int dinucIndex) {
|
||||
// return data[posIndex][qual][dinucIndex];
|
||||
//}
|
||||
|
||||
private void set(int posIndex, int qual, int dinucIndex, RecalData datum) {
|
||||
// grow data if necessary
|
||||
ExpandingArrayList<ExpandingArrayList<RecalData>> d2 = data.get(posIndex);
|
||||
|
|
@ -137,31 +139,6 @@ public class RecalDataManager {
|
|||
d3.set(dinucIndex, datum);
|
||||
}
|
||||
|
||||
//private void set(int posIndex, int qual, int dinucIndex, RecalData datum) {
|
||||
// data[posIndex][qual][dinucIndex] = datum;
|
||||
//}
|
||||
|
||||
|
||||
/* public List<RecalData> select(int pos, int qual, int dinuc_index ) {
|
||||
List<RecalData> l = new LinkedList<RecalData>();
|
||||
for ( int i = 0; i < data.length; i++ ) {
|
||||
if ( i == pos || pos == -1 || ! trackPos ) {
|
||||
for ( int j = 0; j < data[i].length; j++ ) {
|
||||
if ( j == qual || qual == -1 ) {
|
||||
for ( int k = 0; k < data[i][j].length; k++ ) {
|
||||
if ( k == dinuc_index|| dinuc_index == -1 || ! trackDinuc ) {
|
||||
l.add(data[i][j][k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return l;
|
||||
}
|
||||
*/
|
||||
|
||||
private RecalData getMatchingDatum(List<RecalData> l, RecalData datum,
|
||||
boolean combinePos, boolean combineQual, boolean combineDinuc) {
|
||||
for ( RecalData find : l ) {
|
||||
|
|
@ -199,5 +176,20 @@ public class RecalDataManager {
|
|||
public List<RecalData> getAll() {
|
||||
return flattenData;
|
||||
}
|
||||
|
||||
// we can process the OQ field if requested
|
||||
public static byte[] getQualsForRecalibration( SAMRecord read, boolean useOriginalQuals ) {
|
||||
byte[] quals = read.getBaseQualities();
|
||||
if ( useOriginalQuals && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
||||
if ( obj instanceof String )
|
||||
quals = QualityUtils.fastqToPhred((String)obj);
|
||||
else {
|
||||
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
}
|
||||
|
||||
return quals;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@ import java.util.regex.Pattern;
|
|||
import java.util.regex.Matcher;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.lang.reflect.Method;
|
||||
|
||||
@WalkerName("TableRecalibration")
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
|
|
@ -52,14 +53,17 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
@Argument(shortName="rawQempirical", doc="If provided, we will use raw Qempirical scores calculated from the # mismatches and # bases, rather than the more conservative estimate of # mismatches + 1 / # bases + 1", required=false)
|
||||
public boolean rawQempirical = false;
|
||||
|
||||
@Argument(shortName="adjustQ0Bases", doc="If provided, Q0 bases will have their quality scores modified, otherwise they will be left as Q0 in the output", required=false)
|
||||
public boolean adjustQ0Bases = false;
|
||||
@Argument(fullName="preserveQScoresLessThan", shortName="pQ", doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
|
||||
public int preserveQScoresLessThan = 5;
|
||||
|
||||
@Argument(fullName = "useOriginalQuals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
public boolean useOriginalQuals = false;
|
||||
|
||||
//
|
||||
// Basic static information
|
||||
//
|
||||
private static Logger logger = Logger.getLogger(TableRecalibrationWalker.class);
|
||||
|
||||
private static String VERSION = "0.2.3";
|
||||
|
||||
private final static boolean DEBUG = false;
|
||||
private static String VERSION = "0.2.4";
|
||||
|
||||
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
|
||||
HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>();
|
||||
|
|
@ -82,10 +86,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
private static Pattern COLLAPSED_DINUC_PATTERN = Pattern.compile("^#\\s+collapsed_dinuc\\s+(\\w+)");
|
||||
private static Pattern HEADER_PATTERN = Pattern.compile("^rg.*");
|
||||
|
||||
//private static boolean DEBUG_ME = true;
|
||||
|
||||
public void initialize() {
|
||||
logger.info("TableRecalibrator version: " + VERSION);
|
||||
|
||||
//
|
||||
// crappy hack until Enum arg types are supported
|
||||
//
|
||||
|
|
@ -233,7 +236,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
public SAMRecord map(char[] ref, SAMRecord read) {
|
||||
byte[] bases = read.getReadBases();
|
||||
byte[] quals = read.getBaseQualities();
|
||||
|
||||
|
||||
byte[] quals = RecalDataManager.getQualsForRecalibration(read, useOriginalQuals);
|
||||
|
||||
// Since we want machine direction reads not corrected positive strand reads, rev comp any negative strand reads
|
||||
if (read.getReadNegativeStrandFlag()) {
|
||||
|
|
@ -244,33 +249,30 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
try {
|
||||
byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals);
|
||||
|
||||
//if ( read.getReadName().equals("IL12_395:7:215:171:693") ) {
|
||||
// for ( int i = 0; i < quals.length; i++ ) {
|
||||
// System.out.printf("READ found: %s is now %s%n", quals[i], recalQuals[i]);
|
||||
// }
|
||||
//}
|
||||
// Don't change Q scores below some threshold
|
||||
preserveQScores(quals, recalQuals, read);
|
||||
|
||||
if (read.getReadNegativeStrandFlag()) // reverse the quals for the neg strand read
|
||||
if (read.getReadNegativeStrandFlag()) { // reverse the quals for the neg strand read
|
||||
recalQuals = BaseUtils.reverse(recalQuals);
|
||||
|
||||
// special case the first and last bases in SOLID reads, which are always 0
|
||||
// We actually just never change Q0 bases
|
||||
preserveQ0Bases(quals, recalQuals, read);
|
||||
quals = BaseUtils.reverse(quals);
|
||||
}
|
||||
|
||||
read.setBaseQualities(recalQuals);
|
||||
if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) {
|
||||
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(quals));
|
||||
}
|
||||
|
||||
return read;
|
||||
} catch ( StingException e ) {
|
||||
throw new RuntimeException(String.format("Bug found while processing read %s: %s", read.format(), e.getMessage()));
|
||||
}
|
||||
}
|
||||
|
||||
private void preserveQ0Bases( byte[] originalQuals, byte[] recalQuals, SAMRecord read ) {
|
||||
if ( ! adjustQ0Bases ) {
|
||||
for ( int i = 0; i < recalQuals.length; i++ ) {
|
||||
//System.out.printf("Original qual %d => %d%n", originalQuals[i], recalQuals[i]);
|
||||
if ( originalQuals[i] == 0 ) {
|
||||
//System.out.printf("Preserving Q0 base at %d in read %s%n", i, read.getReadName());
|
||||
recalQuals[i] = 0;
|
||||
}
|
||||
private void preserveQScores( byte[] originalQuals, byte[] recalQuals, SAMRecord read ) {
|
||||
for ( int i = 0; i < recalQuals.length; i++ ) {
|
||||
if ( originalQuals[i] < preserveQScoresLessThan ) {
|
||||
System.out.printf("Preserving Q%d base at %d in read %s%n", originalQuals[i], i, read.getReadName());
|
||||
recalQuals[i] = originalQuals[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -528,12 +530,6 @@ class SerialRecalMapping implements RecalMapping {
|
|||
readGroup, prevBase, base, cycle, qual, qual, globalDeltaQ, deltaQPos, deltaQDinuc, newQualByte));
|
||||
}
|
||||
|
||||
//if ( printStateP(pos, RecalData.dinucIndex2bases(index), qual) )
|
||||
// System.out.printf("%s %c%c %d %d => %d + %.2f + %.2f + %.2f + %.2f = %d%n",
|
||||
// readGroup, prevBase, base, cycle, qual,
|
||||
// qual, globalDeltaQ, deltaQual, deltaQPos, deltaQDinuc,
|
||||
// newQualByte);
|
||||
|
||||
return newQualByte;
|
||||
}
|
||||
}
|
||||
|
|
@ -206,4 +206,73 @@ public class QualityUtils {
|
|||
|
||||
return rcCompressedQuals;
|
||||
}
|
||||
|
||||
// TODO --
|
||||
// TODO --
|
||||
// TODO -- stolen from SAMUtils in picard -- remove when public access is granted
|
||||
// TODO --
|
||||
// TODO --
|
||||
|
||||
/**
|
||||
* Convert an array of bytes, in which each byte is a binary phred quality score, to
|
||||
* printable ASCII representation of the quality scores, ala FASTQ format.
|
||||
* @param buffer Array of bytes in which each byte is a binar phred score.
|
||||
* @param offset Where in buffer to start conversion.
|
||||
* @param length How many bytes of buffer to convert.
|
||||
* @return String with ASCII representation of those quality scores.
|
||||
*/
|
||||
public static String phredToFastq(final byte[] buffer, final int offset, final int length) {
|
||||
final char[] chars = new char[length];
|
||||
for (int i = 0; i < length; i++) {
|
||||
chars[i] = phredToFastq(buffer[offset+i] & 0xFF);
|
||||
}
|
||||
return new String(chars);
|
||||
}
|
||||
|
||||
/** convenience -- should be pushed back into Picard */
|
||||
public static String phredToFastq(final byte[] buffer) {
|
||||
return phredToFastq(buffer, 0, buffer.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a single binary phred score to printable ASCII representation, ala FASTQ format.
|
||||
* @param phredScore binary phred score.
|
||||
* @return Printable ASCII representation of phred score.
|
||||
*/
|
||||
public static char phredToFastq(final int phredScore) {
|
||||
if (phredScore < 0 || phredScore > 63) {
|
||||
throw new IllegalArgumentException("Cannot encode phred score: " + phredScore);
|
||||
}
|
||||
return (char) (33 + phredScore);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a string with phred scores in printable ASCII FASTQ format to an array
|
||||
* of binary phred scores.
|
||||
* @param fastq Phred scores in FASTQ printable ASCII format.
|
||||
* @return byte array of binary phred scores in which each byte corresponds to a character in the input string.
|
||||
*/
|
||||
public static byte[] fastqToPhred(final String fastq) {
|
||||
if (fastq == null) {
|
||||
return null;
|
||||
}
|
||||
final int length = fastq.length();
|
||||
final byte[] scores = new byte[length];
|
||||
for (int i = 0; i < length; i++) {
|
||||
scores[i] = (byte) fastqToPhred(fastq.charAt(i));
|
||||
}
|
||||
return scores;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a single printable ASCII FASTQ format phred score to binary phred score.
|
||||
* @param ch Printable ASCII FASTQ format phred score.
|
||||
* @return Binary phred score.
|
||||
*/
|
||||
public static int fastqToPhred(final char ch) {
|
||||
if (ch < 33 || ch > 126) {
|
||||
throw new IllegalArgumentException("Invalid fastq character: " + ch);
|
||||
}
|
||||
return (ch - 33);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -93,7 +93,7 @@ public class CovariateCounterTest extends BaseTest {
|
|||
@Test
|
||||
public void testOneRead() {
|
||||
for ( int i = 1; i < read1.getReadBases().length; i++ )
|
||||
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
|
||||
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
|
||||
c.printState();
|
||||
|
||||
Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0);
|
||||
|
|
@ -112,9 +112,9 @@ public class CovariateCounterTest extends BaseTest {
|
|||
@Test
|
||||
public void testTwoReads() {
|
||||
for ( int i = 1; i < read1.getReadBases().length; i++ )
|
||||
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
|
||||
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
|
||||
for ( int i = 1; i < read2.getReadBases().length; i++ )
|
||||
c.updateDataFromRead(readGroup2, read2, i, (char)read2.getReadBases()[i]);
|
||||
c.updateDataFromRead(readGroup2, read2, i, (char)read2.getReadBases()[i], false);
|
||||
c.printState();
|
||||
|
||||
Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0);
|
||||
|
|
@ -133,9 +133,9 @@ public class CovariateCounterTest extends BaseTest {
|
|||
@Test
|
||||
public void testTwoReadsSameGroup() {
|
||||
for ( int i = 1; i < read1.getReadBases().length; i++ )
|
||||
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
|
||||
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
|
||||
for ( int i = 1; i < read2.getReadBases().length; i++ )
|
||||
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
|
||||
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
|
||||
c.printState();
|
||||
|
||||
for ( int i = 1; i < bases1.length; i++ ) {
|
||||
|
|
@ -153,9 +153,9 @@ public class CovariateCounterTest extends BaseTest {
|
|||
@Test
|
||||
public void testTwoReadsSameGroupNotIdentical() {
|
||||
for ( int i = 1; i < read1.getReadBases().length; i++ )
|
||||
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
|
||||
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
|
||||
for ( int i = 1; i < read3.getReadBases().length; i++ )
|
||||
c.updateDataFromRead(readGroup1, read3, i, (char)read3.getReadBases()[i]);
|
||||
c.updateDataFromRead(readGroup1, read3, i, (char)read3.getReadBases()[i], false);
|
||||
c.printState();
|
||||
|
||||
for ( int i = 1; i < bases1.length; i++ ) {
|
||||
|
|
@ -177,6 +177,6 @@ public class CovariateCounterTest extends BaseTest {
|
|||
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases, quals);
|
||||
|
||||
c.updateDataFromRead(readGroup1, read, 0, (char)bases[0]);
|
||||
c.updateDataFromRead(readGroup1, read, 0, (char)bases[0], false);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue