basic clustering of reads to reduce artifacts

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@873 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kcibul 2009-06-02 02:54:21 +00:00
parent e712d69382
commit c4cb867d74
1 changed files with 650 additions and 75 deletions

View File

@ -1,20 +1,25 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.playground.indels.SWPairwiseAlignment;
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.util.List;
import java.util.HashMap;
import java.util.Map;
import java.io.IOException;
import java.util.*;
@By(DataSource.REFERENCE)
public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
@ -78,10 +83,91 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
public static int MIN_QSCORE = 13;
public static double TUMOR_LOD = 6.3d;
// public static double TUMOR_LOD = 1.0d;
// public static double TUMOR_LOD = 6.3d;
public static double TUMOR_LOD = 4.0d;
public static double NORMAL_LOD = 2.3d;
private String cachedContigName;
private byte[] cachedContig;
private boolean outputFailures = true;
private static class LocusReadPile {
private char refBase;
public List<SAMRecord> reads = new ArrayList<SAMRecord>();
public List<Integer> offsets = new ArrayList<Integer>();
public GenotypeLikelihoods likelihoods = new GenotypeLikelihoods();
public QualitySums qualitySums = new QualitySums();
public LocusReadPile(char refBase) {
this.refBase = refBase;
}
public void add(SAMRecord read, int offset) {
char base = read.getReadString().charAt(offset);
byte qual = read.getBaseQualities()[offset];
if (base == 'N' || base == 'n') { return; }
reads.add(read);
offsets.add(offset);
likelihoods.add(refBase, base, qual);
if (qual > MIN_QSCORE) qualitySums.incrementSum(base, qual);
}
public String getLocusBases() {
StringBuilder sb = new StringBuilder();
for(int i=0; i<reads.size(); i++) {
sb.append(reads.get(i).getReadString().charAt(offsets.get(i)));
}
return sb.toString();
}
public double getAltVsRef(char altAllele) {
double[] tumorRefAlt = extractRefAlt(this.likelihoods, this.refBase, altAllele);
double tumorLod = Math.log10(tumorRefAlt[1] + tumorRefAlt[2]) - tumorRefAlt[0];
return tumorLod;
}
public double getRefVsAlt(char altAllele) {
double[] refAlt = extractRefAlt(this.likelihoods, this.refBase, altAllele);
double normalLod = refAlt[0] - Math.log10(refAlt[1] + refAlt[2]);
return normalLod;
}
/**
* Extract the LOD comparing ref:ref to ref:alt and alt:alt
*/
private double[] extractRefAlt(GenotypeLikelihoods gl, char ref, char altAllele) {
double refRef = 0;
double altRef = 0;
double altAlt = 0;
for(int j=0; j<10; j++) {
String gt = gl.genotypes[j];
double likelihood = gl.likelihoods[j];
// the ref:mutant theory
if ( (gt.charAt(0) == ref && gt.charAt(1) == altAllele) ||
(gt.charAt(0) == altAllele && gt.charAt(1) == ref) ) {
altRef += Math.pow(10, likelihood);
}
if ( gt.charAt(0) == altAllele && gt.charAt(1) == altAllele) {
altAlt += Math.pow(10, likelihood);
}
if ( gt.charAt(0) == ref && gt.charAt(1) == ref) {
refRef = likelihood;
}
}
return new double[]{refRef, altRef, altAlt};
}
}
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
Map<Character, Integer> midp = new HashMap<Character, Integer>();
@ -109,14 +195,9 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
List<SAMRecord> reads = context.getReads();
GenotypeLikelihoods tumorGL = new GenotypeLikelihoods();
GenotypeLikelihoods normalGL = new GenotypeLikelihoods();
LocusReadPile tumorReadPile = new LocusReadPile(ref);
LocusReadPile normalReadPile = new LocusReadPile(ref);
QualitySums tumorQualitySums = new QualitySums();
QualitySums normalQualitySums = new QualitySums();
StringBuilder tumorBases = new StringBuilder();
StringBuilder normalBases = new StringBuilder();
for ( int i = 0; i < reads.size(); i++ )
{
SAMRecord read = reads.get(i);
@ -126,7 +207,7 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
read.getReadUnmappedFlag() ||
read.getMappingQuality() <= 0
// || (read.getReadPairedFlag() && (!read.getProperPairFlag() || read.getInferredInsertSize() >= MAX_INSERT_SIZE))
|| (read.getReadPairedFlag() && (!read.getProperPairFlag() || read.getInferredInsertSize() >= MAX_INSERT_SIZE))
) {
continue;
}
@ -135,20 +216,20 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
String sample = read.getHeader().getReadGroup(rg).getSample();
int offset = context.getOffsets().get(i);
char base = read.getReadString().charAt(offset);
byte qual = read.getBaseQualities()[offset];
if (base == 'N' || base == 'n') { continue; }
// TODO: build a pile of reads and offsets, then pass that into a
// constructor for the normalGL class
// that way, we can build a different pile of reads later on and extract the genotype
if (normalSampleName.equals(sample)) {
normalGL.add(ref, base, qual);
if (qual > MIN_QSCORE) normalQualitySums.incrementSum(base, qual);
normalBases.append(base);
normalReadPile.add(read, offset);
} else if (tumorSampleName.equals(sample)) {
tumorGL.add(ref, base, qual);
if (qual > MIN_QSCORE) tumorQualitySums.incrementSum(base, qual);
tumorBases.append(base);
tumorReadPile.add(read, offset);
int midDist = Math.abs((int)(read.getReadLength() / 2) - offset);
if (midDist < midp.get(base)) { midp.put(base, midDist); }
@ -159,7 +240,7 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
}
// pretest: if the sum of the quality scores for all non-ref alleles < 60, just quit looking now
if (tumorQualitySums.getOtherQualities(ref) < MIN_MUTANT_SUM_PRETEST) {
if (tumorReadPile.qualitySums.getOtherQualities(ref) < MIN_MUTANT_SUM_PRETEST) {
return -1;
}
@ -170,13 +251,18 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
if (altAllele == upRef) { continue; }
double[] tumorRefAlt = extractRefAlt(tumorGL, ref, altAllele);
double tumorLod = Math.log10(tumorRefAlt[1] + tumorRefAlt[2]) - tumorRefAlt[0];
// (i) either an adjusted quality score sum in the tumor for the mutant base must be
// at least 100 or the LOD score for mutant:ref + mutant:mutant vs ref:ref must
// be at least 6.3;
if (tumorQualitySums.get(altAllele) < MIN_MUTANT_SUM && tumorLod < TUMOR_LOD) {
int mutantSum = tumorReadPile.qualitySums.get(altAllele);
int refSum = tumorReadPile.qualitySums.get(ref);
if (tumorReadPile.getAltVsRef(altAllele) >= TUMOR_LOD
// ||
// (mutantSum >= MIN_MUTANT_SUM && (float)mutantSum / (float) refSum >= 0.05f)
) {
// yep -- just fall through... easier to think about this way!
} else {
continue;
}
@ -187,12 +273,64 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
// return 0;
// }
double[] refAlt = extractRefAlt(normalGL, ref, altAllele);
double normalLod = refAlt[0] - Math.log10(refAlt[1] + refAlt[2]);
// do a MSA to figure out if the alternate reads comes from a cluster of reads with more
// than one alternate allele (hints at being an alignment artifact)
ReferenceSequence refSeq;
// TODO: don't hardcode. Make this the max read length in the pile
long refStart = context.getLocation().getStart() - 100;
//tumorReadPile.offsets.get(0);
long refStop = context.getLocation().getStart() + 100;
try {
IndexedFastaSequenceFile seqFile = new IndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
refSeq = seqFile.getSubsequenceAt(context.getContig(),refStart, refStop);
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
// System.out.println("TESTING " + context.getContig() + ":" + context.getPosition());
LocusReadPile t2 = filterHighMismatchScoreReads(tumorReadPile, StringUtil.bytesToString(refSeq.getBases()), refStart);
// TEST the LOD score again!
// TODO: extract this since we'll do it multiple times...
// (i) either an adjusted quality score sum in the tumor for the mutant base must be
// at least 100 or the LOD score for mutant:ref + mutant:mutant vs ref:ref must
// be at least 6.3;
double tumorLod = t2.getAltVsRef(altAllele);
if (t2.qualitySums.get(altAllele) < MIN_MUTANT_SUM && tumorLod < TUMOR_LOD) {
if (outputFailures) {
System.out.println("FAILED " + context.getContig() + ":" + context.getPosition() + " due to MAX MM QSCORE TEST." +
" LOD was " + tumorReadPile.getAltVsRef(altAllele) +
" LOD is now " + t2.getAltVsRef(altAllele) +
" QSUM was " + tumorReadPile.qualitySums.get(altAllele) +
" QSUM is now " + t2.qualitySums.get(altAllele));
}
continue;
}
// TODO: using the original pile here since often these artifacts will be supported
// by those reads that get thrown out! Maybe that means we don't need the noise filter...
boolean shouldDisalign =
disaligner(context.getPosition(), tumorReadPile, StringUtil.bytesToString(refSeq.getBases()), refStart);
if (shouldDisalign) {
if (outputFailures) {
System.out.println("FAILED " + context.getContig() + ":" + context.getPosition() + " due to DISALIGNMENT TEST.");
}
continue;
}
// (ii) the quality score sum for the mutant base in the normal must be < 50 and the
// LOD score for ref:ref vs mutant:ref + mutant:mutant must be at least 2.3.
if ( normalQualitySums.get(altAllele) > 50 || normalLod < NORMAL_LOD ) {
double normalLod = normalReadPile.getRefVsAlt(altAllele);
if ( normalReadPile.qualitySums.get(altAllele) > 50 || normalLod < NORMAL_LOD ) {
continue;
}
@ -204,11 +342,11 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
context.getPosition() + "\t" +
context.getPosition() + "\t" +
"TScore:" + tumorLod +
"__TRefSum: " + tumorQualitySums.get(ref) +
"__TAltSum: " + tumorQualitySums.get(altAllele) +
"__TRefSum: " + tumorReadPile.qualitySums.get(ref) +
"__TAltSum: " + tumorReadPile.qualitySums.get(altAllele) +
"__NScore:" + normalLod +
"__NRefSum: " + normalQualitySums.get(ref) +
"__NAltSum: " + normalQualitySums.get(altAllele) +
"__NRefSum: " + normalReadPile.qualitySums.get(ref) +
"__NAltSum: " + normalReadPile.qualitySums.get(altAllele) +
"__MIDP: " + midp.get(altAllele) +
(failedMidpointCheck?"__FAILED-MPCHECK":"")
);
@ -216,13 +354,13 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
} else {
out.println(context.getLocation() + " " + upRef + " " + altAllele +
" TScore:" + tumorLod +
" TRefSum: " + tumorQualitySums.get(ref) +
" TAltSum: " + tumorQualitySums.get(altAllele) +
" TRefSum: " + tumorReadPile.qualitySums.get(ref) +
" TAltSum: " + tumorReadPile.qualitySums.get(altAllele) +
" NScore:" + normalLod +
" NRefSum: " + normalQualitySums.get(ref) +
" NAltSum: " + normalQualitySums.get(altAllele) + " " +
tumorBases.toString() + " " +
normalBases.toString()
" NRefSum: " + normalReadPile.qualitySums.get(ref) +
" NAltSum: " + normalReadPile.qualitySums.get(altAllele) + " " +
tumorReadPile.getLocusBases().toString() + " " +
normalReadPile.getLocusBases().toString()
);
}
@ -235,34 +373,6 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
return -1;
}
/**
* Extract the LOD comparing ref:ref to ref:alt and alt:alt
*/
private double[] extractRefAlt(GenotypeLikelihoods gl, char ref, char altAllele) {
double refRef = 0;
double altRef = 0;
double altAlt = 0;
for(int j=0; j<10; j++) {
String gt = gl.genotypes[j];
double likelihood = gl.likelihoods[j];
// the ref:mutant theory
if ( (gt.charAt(0) == ref && gt.charAt(1) == altAllele) ||
(gt.charAt(0) == altAllele && gt.charAt(1) == ref) ) {
altRef += Math.pow(10, likelihood);
}
if ( gt.charAt(0) == altAllele && gt.charAt(1) == altAllele) {
altAlt += Math.pow(10, likelihood);
}
if ( gt.charAt(0) == ref && gt.charAt(1) == ref) {
refRef = likelihood;
}
}
return new double[]{refRef, altRef, altAlt};
}
// Given result of map function
public Integer reduceInit() {
@ -279,4 +389,469 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
int MAX_READ_MISMATCH_QUALITY_SCORE_SUM = 100;
private LocusReadPile filterHighMismatchScoreReads(LocusReadPile pile, String reference, long leftmostIndex) {
LocusReadPile newPile = new LocusReadPile(pile.refBase);
for ( int i=0; i< pile.reads.size(); i++) {
SAMRecord read = pile.reads.get(i);
int offset = pile.offsets.get(i);
AlignedRead aRead = new AlignedRead(read);
int mismatchScore = mismatchQualitySum(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex);
if (mismatchScore <= MAX_READ_MISMATCH_QUALITY_SCORE_SUM) {
newPile.add(read, offset);
} else {
char base = read.getReadString().charAt(offset);
// TODO: use a logger to output this information for debugging/Picard folks
//System.out.println("MMQSUM: Filtering out " + read.getReadName() + " with locus base " + base + " and ref of " + pile.refBase);
}
}
return newPile;
}
// also pass in the variant position
private boolean disaligner(long mutationLocus, LocusReadPile pile, String reference, long leftmostIndex) {
int VARIANT_SITE_MIN_QSCORE = 40; // ie 2x20
// build a 2d array of read # vs position
// Map<Long, Map<Character, List<Mismatch>>> matrix = new TreeMap<Long, Map<Character, List<Mismatch>>>();
int mutationLocusOffset = (int)(mutationLocus - leftmostIndex);
int[][] mmm = new int[pile.reads.size()][reference.length()];
// for every read which has an offset
for ( int i=0; i< pile.reads.size(); i++) {
SAMRecord read = pile.reads.get(i);
int offset = pile.offsets.get(i);
AlignedRead aRead = new AlignedRead(read);
List<Mismatch> mismatches = getMismatches(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex);
// now fill in with -1 where we had reference
// TODO: deal with other CIGAR strings?
for(long j=read.getAlignmentStart(); j <= read.getAlignmentEnd(); j++) {
mmm[i][(int)( j - leftmostIndex )] = -1;
}
for(Mismatch mm : mismatches) {
if (mm.mismatchBase == 'N') { continue; }
mmm[i][(int)mm.position] = mm.qualityScore;
}
}
// traverse in column-order
int[] altSums = new int[mmm[0].length];
for(int j=0; j<altSums.length; j++) {
for(int i=0; i<mmm.length; i++) {
if (mmm[i][j] > 0) altSums[j] += mmm[i][j];
}
}
// now check that we have at least 3 alignment artifacts, and that
// they are at least 5bp from eachother
// FIXME: what should these numbers be?
int varSites = 0;
int lastVarSite = -1;
for(int j=0; j<altSums.length; j++) {
if (altSums[j] >= VARIANT_SITE_MIN_QSCORE) {
if (lastVarSite > -1) {
if (j - lastVarSite > 5) {
varSites++;
lastVarSite = j;
}
}
}
}
if (varSites < 3) { return false; }
// now go through each read which has the alternate allele at the
// current site (the potential somatic mutation) and see what fraction of
// the "covered" variant bases were called as variant
Map<Integer, Float> fractionVariantSites = new HashMap<Integer, Float>();
for(int i=0; i<mmm.length; i++) {
// does this read have the variant allele?
if (mmm[i][mutationLocusOffset] > 0) {
float calledSites = 0;
float variantSites = 0;
for(int j=0; j<altSums.length; j++) {
if (altSums[j] > VARIANT_SITE_MIN_QSCORE && j != mutationLocusOffset) {
if (mmm[i][j] != 0) { calledSites++; }
if (mmm[i][j] > 0) { variantSites++; }
}
}
fractionVariantSites.put(i, variantSites / calledSites);
}
}
// FIXME: what's the right statistic to use here?
if (getMedian(fractionVariantSites.values()) >= 0.75) {
return true;
}
return false;
}
protected float getMedian(Collection<Float> in) {
float[] data = new float[in.size()];
int i=0;
for(float f : in) { data[i++] = f; }
Arrays.sort(data);
int middle = data.length/2; // subscript of middle element
if (data.length%2 == 1) {
return data[middle];
} else {
// Even number -- return average of middle two
// Must cast the numbers to double before dividing.
return (data[middle-1] + data[middle]) / 2.0f;
}
}
private void clean(LocusReadPile pile, String reference, long leftmostIndex) {
// LocusReadPile newPile = new LocusReadPile(pile.refBase);
ArrayList<SAMRecord> refReads = new ArrayList<SAMRecord>();
ArrayList<AlignedRead> altReads = new ArrayList<AlignedRead>();
ArrayList<Boolean> altAlignmentsToTest = new ArrayList<Boolean>();
int totalMismatchSum = 0;
int MIN_MISMATCH_QSCORE = 10;
// decide which reads potentially need to be cleaned
for ( int i=0; i< pile.reads.size(); i++) {
SAMRecord read = pile.reads.get(i);
int offset = pile.offsets.get(i);
AlignedRead aRead = new AlignedRead(read);
int mismatchScore = mismatchQualitySum(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex, MIN_MISMATCH_QSCORE);
// if this doesn't match perfectly to the reference, let's try to clean it
if ( mismatchScore > 0 ) {
if ( mismatchScore > 100) {
System.out.println(read.getReadName() + " has a sum of mismatch quality scores of " + mismatchScore);
} else {
altReads.add(aRead);
altAlignmentsToTest.add(true);
totalMismatchSum += mismatchScore;
aRead.setMismatchScoreToReference(mismatchScore);
}
}
// otherwise, we can emit it as is
else {
refReads.add(read);
}
}
// build a map of position ->
Consensus bestConsensus = null;
// for each alternative consensus to test, align it to the reference and create an alternative consensus
for ( int index = 0; index < altAlignmentsToTest.size(); index++ ) {
if ( altAlignmentsToTest.get(index) ) {
// do a pairwise alignment against the reference
AlignedRead aRead = altReads.get(index);
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString());
int refIdx = swConsensus.getAlignmentStart2wrt1();
if ( refIdx < 0 )
continue;
// create the new consensus
StringBuffer sb = new StringBuffer();
sb.append(reference.substring(0, refIdx));
Cigar c = swConsensus.getCigar();
logger.debug("CIGAR = " + cigarToString(c));
int indelCount = 0;
int altIdx = 0;
boolean ok_flag = true;
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
CigarElement ce = c.getCigarElement(i);
switch( ce.getOperator() ) {
case D:
indelCount++;
refIdx += ce.getLength();
break;
case M:
if ( reference.length() < refIdx+ce.getLength() )
ok_flag = false;
else
sb.append(reference.substring(refIdx, refIdx+ce.getLength()));
refIdx += ce.getLength();
altIdx += ce.getLength();
break;
case I:
sb.append(aRead.getReadString().substring(altIdx, altIdx+ce.getLength()));
altIdx += ce.getLength();
indelCount++;
break;
}
}
// make sure that there is at most only a single indel and it aligns appropriately!
if ( !ok_flag || indelCount > 1 || reference.length() < refIdx )
continue;
sb.append(reference.substring(refIdx));
String altConsensus = sb.toString();
// for each imperfect match to the reference, score it against this alternative
Consensus consensus = new Consensus(altConsensus, c, swConsensus.getAlignmentStart2wrt1());
for ( int j = 0; j < altReads.size(); j++ ) {
AlignedRead toTest = altReads.get(j);
Pair<Integer, Integer> altAlignment = findBestOffset(altConsensus, toTest);
// the mismatch score is the min of its alignment vs. the reference and vs. the alternate
int myScore = altAlignment.getSecond();
if ( myScore >= toTest.getMismatchScoreToReference() )
myScore = toTest.getMismatchScoreToReference();
// keep track of reads that align better to the alternate consensus
else
consensus.readIndexes.add(new Pair<Integer, Integer>(j, altAlignment.getFirst()));
logger.debug(aRead.getReadString() + " vs. " + toTest.getReadString() + " => " + myScore + " - " + altAlignment.getFirst());
consensus.mismatchSum += myScore;
if ( myScore == 0 )
// we already know that this is its consensus, so don't bother testing it later
altAlignmentsToTest.set(j, false);
}
logger.debug(aRead.getReadString() + " " + consensus.mismatchSum);
if ( bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) {
bestConsensus = consensus;
logger.debug(aRead.getReadString() + " " + consensus.mismatchSum);
}
}
}
System.out.println("Done with MSA");
// // if the best alternate consensus has a smaller sum of quality score mismatches (more than the LOD threshold), then clean!
// if ( bestConsensus != null && ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0 >= LOD_THRESHOLD ) {
// logger.debug("CLEAN: " + bestConsensus.str );
// if ( indelOutput != null && bestConsensus.cigar.numCigarElements() > 1 ) {
// StringBuffer str = new StringBuffer();
// str.append(reads.get(0).getReferenceName());
// int position = bestConsensus.positionOnReference + bestConsensus.cigar.getCigarElement(0).getLength();
// str.append(":" + (leftmostIndex + position));
// CigarElement ce = bestConsensus.cigar.getCigarElement(1);
// str.append("\t" + ce.getLength() + ce.getOperator());
// str.append("\t" + (((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0) + "\n");
// try {
// indelOutput.write(str.toString());
// indelOutput.flush();
// } catch (Exception e) {}
// }
//
// // We need to update the mapping quality score of the cleaned reads;
// // however we don't have enough info to use the proper MAQ scoring system.
// // For now, we'll use a heuristic:
// // the mapping quality score is improved by the LOD difference in mismatching
// // bases between the reference and alternate consensus
// int improvement = (totalMismatchSum - bestConsensus.mismatchSum) / 10;
//
// // clean the appropriate reads
// for ( Pair<Integer, Integer> indexPair : bestConsensus.readIndexes ) {
// AlignedRead aRead = altReads.get(indexPair.getFirst());
// updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.getSecond(), aRead, (int)leftmostIndex);
// aRead.getRead().setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + improvement, 255));
// aRead.getRead().setAttribute("NM", numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex));
// }
// }
//
// // write them out
// for ( SAMRecord rec : refReads )
// readsToWrite.add(new ComparableSAMRecord(rec));
// for ( AlignedRead aRec : altReads )
// readsToWrite.add(new ComparableSAMRecord(aRec.getRead()));
}
// ----------------------------------- PRIVATE IN IntervalCleanerWalker
public static final int MAX_QUAL = 99;
private static class Mismatch {
AlignedRead aRead;
int offset;
long position;
char mismatchBase;
int qualityScore;
private Mismatch(AlignedRead aRead, int offset, long position, char mismatchBase, int qualityScore) {
this.aRead = aRead;
this.offset = offset;
this.position = position;
this.mismatchBase = mismatchBase;
this.qualityScore = qualityScore;
}
}
private static int mismatchQualitySum(AlignedRead aRead, String refSeq, int refIndex) {
return mismatchQualitySum(aRead, refSeq, refIndex, 0);
}
private static int mismatchQualitySum(AlignedRead aRead, String refSeq, int refIndex, int minMismatchQualityScore) {
List<Mismatch> mismatches = getMismatches(aRead, refSeq, refIndex);
int sum = 0;
for(Mismatch mm : mismatches) {
if (mm.qualityScore >= minMismatchQualityScore) {
sum += mm.qualityScore;
}
}
return sum;
}
/**
* Returns a list of position
* @param aRead
* @param refSeq
* @param refIndex
* @param minMismatchQualityScore
* @return
*/
private static List<Mismatch> getMismatches(AlignedRead aRead, String refSeq, int refIndex) {
List<Mismatch> mismatches = new ArrayList<Mismatch>();
String readSeq = aRead.getReadString();
String quals = aRead.getBaseQualityString();
int readIndex = 0;
int sum = 0;
Cigar c = aRead.getCigar();
for (int i = 0 ; i < c.numCigarElements() ; i++) {
CigarElement ce = c.getCigarElement(i);
switch ( ce.getOperator() ) {
case M:
for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIndex++ ) {
// FIXME: what is this case????
if ( refIndex >= refSeq.length() ) {
sum += MAX_QUAL;
} else {
char readBase = Character.toUpperCase(readSeq.charAt(readIndex));
char refBase = Character.toUpperCase(refSeq.charAt(refIndex));
if ( readBase != refBase ) {
int qual = (int)quals.charAt(readIndex) - 33;
mismatches.add(new Mismatch(aRead, readIndex, refIndex, readBase, qual));
}
}
}
break;
case I:
readIndex += ce.getLength();
break;
case D:
refIndex += ce.getLength();
break;
}
}
return mismatches;
}
private Pair<Integer, Integer> findBestOffset(String ref, AlignedRead read) {
int attempts = ref.length() - read.getReadLength() + 1;
int bestScore = mismatchQualitySum(read, ref, 0);
int bestIndex = 0;
for ( int i = 1; i < attempts; i++ ) {
// we can't get better than 0!
if ( bestScore == 0 )
return new Pair<Integer, Integer>(bestIndex, 0);
int score = mismatchQualitySum(read, ref, i);
if ( score < bestScore ) {
bestScore = score;
bestIndex = i;
}
}
return new Pair<Integer, Integer>(bestIndex, bestScore);
}
private class AlignedRead {
SAMRecord read;
int mismatchScoreToReference;
public AlignedRead(SAMRecord read) {
this.read = read;
mismatchScoreToReference = 0;
}
public SAMRecord getRead() {
return read;
}
public String getReadString() {
return read.getReadString();
}
public int getReadLength() {
return read.getReadLength();
}
public Cigar getCigar() {
return read.getCigar();
}
public void setCigar(Cigar cigar) {
read.setCigar(cigar);
}
public String getBaseQualityString() {
return read.getBaseQualityString();
}
public void setMismatchScoreToReference(int score) {
mismatchScoreToReference = score;
}
public int getMismatchScoreToReference() {
return mismatchScoreToReference;
}
}
private class Consensus {
public String str;
public int mismatchSum;
public int positionOnReference;
public Cigar cigar;
public ArrayList<Pair<Integer, Integer>> readIndexes;
public Consensus(String str, Cigar cigar, int positionOnReference) {
this.str = str;
this.cigar = cigar;
this.positionOnReference = positionOnReference;
mismatchSum = 0;
readIndexes = new ArrayList<Pair<Integer, Integer>>();
}
}
public static String cigarToString(Cigar cig) {
StringBuilder b = new StringBuilder();
for ( int i = 0 ; i < cig.numCigarElements() ; i++ ) {
char c='?';
switch ( cig.getCigarElement(i).getOperator() ) {
case M : c = 'M'; break;
case D : c = 'D'; break;
case I : c = 'I'; break;
}
b.append(cig.getCigarElement(i).getLength());
b.append(c);
}
return b.toString();
}
}