basic clustering of reads to reduce artifacts
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@873 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e712d69382
commit
c4cb867d74
|
|
@ -1,20 +1,25 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers;
|
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||||
|
|
||||||
|
import net.sf.picard.reference.ReferenceSequence;
|
||||||
|
import net.sf.samtools.Cigar;
|
||||||
|
import net.sf.samtools.CigarElement;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.util.StringUtil;
|
||||||
import org.broadinstitute.sting.gatk.LocusContext;
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.By;
|
import org.broadinstitute.sting.gatk.walkers.By;
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
|
import org.broadinstitute.sting.playground.indels.SWPairwiseAlignment;
|
||||||
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
|
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
|
||||||
import net.sf.samtools.SAMRecord;
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||||
|
|
||||||
import java.util.List;
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
import java.util.*;
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
@By(DataSource.REFERENCE)
|
@By(DataSource.REFERENCE)
|
||||||
public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
||||||
|
|
@ -78,10 +83,91 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
||||||
|
|
||||||
public static int MIN_QSCORE = 13;
|
public static int MIN_QSCORE = 13;
|
||||||
|
|
||||||
public static double TUMOR_LOD = 6.3d;
|
// public static double TUMOR_LOD = 6.3d;
|
||||||
// public static double TUMOR_LOD = 1.0d;
|
public static double TUMOR_LOD = 4.0d;
|
||||||
public static double NORMAL_LOD = 2.3d;
|
public static double NORMAL_LOD = 2.3d;
|
||||||
|
|
||||||
|
private String cachedContigName;
|
||||||
|
private byte[] cachedContig;
|
||||||
|
|
||||||
|
private boolean outputFailures = true;
|
||||||
|
|
||||||
|
private static class LocusReadPile {
|
||||||
|
private char refBase;
|
||||||
|
public List<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||||
|
public List<Integer> offsets = new ArrayList<Integer>();
|
||||||
|
public GenotypeLikelihoods likelihoods = new GenotypeLikelihoods();
|
||||||
|
public QualitySums qualitySums = new QualitySums();
|
||||||
|
|
||||||
|
public LocusReadPile(char refBase) {
|
||||||
|
this.refBase = refBase;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void add(SAMRecord read, int offset) {
|
||||||
|
char base = read.getReadString().charAt(offset);
|
||||||
|
byte qual = read.getBaseQualities()[offset];
|
||||||
|
|
||||||
|
if (base == 'N' || base == 'n') { return; }
|
||||||
|
|
||||||
|
reads.add(read);
|
||||||
|
offsets.add(offset);
|
||||||
|
|
||||||
|
|
||||||
|
likelihoods.add(refBase, base, qual);
|
||||||
|
if (qual > MIN_QSCORE) qualitySums.incrementSum(base, qual);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getLocusBases() {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for(int i=0; i<reads.size(); i++) {
|
||||||
|
sb.append(reads.get(i).getReadString().charAt(offsets.get(i)));
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getAltVsRef(char altAllele) {
|
||||||
|
double[] tumorRefAlt = extractRefAlt(this.likelihoods, this.refBase, altAllele);
|
||||||
|
double tumorLod = Math.log10(tumorRefAlt[1] + tumorRefAlt[2]) - tumorRefAlt[0];
|
||||||
|
return tumorLod;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getRefVsAlt(char altAllele) {
|
||||||
|
double[] refAlt = extractRefAlt(this.likelihoods, this.refBase, altAllele);
|
||||||
|
double normalLod = refAlt[0] - Math.log10(refAlt[1] + refAlt[2]);
|
||||||
|
return normalLod;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract the LOD comparing ref:ref to ref:alt and alt:alt
|
||||||
|
*/
|
||||||
|
private double[] extractRefAlt(GenotypeLikelihoods gl, char ref, char altAllele) {
|
||||||
|
double refRef = 0;
|
||||||
|
double altRef = 0;
|
||||||
|
double altAlt = 0;
|
||||||
|
for(int j=0; j<10; j++) {
|
||||||
|
String gt = gl.genotypes[j];
|
||||||
|
double likelihood = gl.likelihoods[j];
|
||||||
|
|
||||||
|
// the ref:mutant theory
|
||||||
|
if ( (gt.charAt(0) == ref && gt.charAt(1) == altAllele) ||
|
||||||
|
(gt.charAt(0) == altAllele && gt.charAt(1) == ref) ) {
|
||||||
|
altRef += Math.pow(10, likelihood);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( gt.charAt(0) == altAllele && gt.charAt(1) == altAllele) {
|
||||||
|
altAlt += Math.pow(10, likelihood);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( gt.charAt(0) == ref && gt.charAt(1) == ref) {
|
||||||
|
refRef = likelihood;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
return new double[]{refRef, altRef, altAlt};
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
Map<Character, Integer> midp = new HashMap<Character, Integer>();
|
Map<Character, Integer> midp = new HashMap<Character, Integer>();
|
||||||
|
|
@ -109,14 +195,9 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
||||||
|
|
||||||
List<SAMRecord> reads = context.getReads();
|
List<SAMRecord> reads = context.getReads();
|
||||||
|
|
||||||
GenotypeLikelihoods tumorGL = new GenotypeLikelihoods();
|
LocusReadPile tumorReadPile = new LocusReadPile(ref);
|
||||||
GenotypeLikelihoods normalGL = new GenotypeLikelihoods();
|
LocusReadPile normalReadPile = new LocusReadPile(ref);
|
||||||
|
|
||||||
QualitySums tumorQualitySums = new QualitySums();
|
|
||||||
QualitySums normalQualitySums = new QualitySums();
|
|
||||||
|
|
||||||
StringBuilder tumorBases = new StringBuilder();
|
|
||||||
StringBuilder normalBases = new StringBuilder();
|
|
||||||
for ( int i = 0; i < reads.size(); i++ )
|
for ( int i = 0; i < reads.size(); i++ )
|
||||||
{
|
{
|
||||||
SAMRecord read = reads.get(i);
|
SAMRecord read = reads.get(i);
|
||||||
|
|
@ -126,7 +207,7 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
||||||
read.getReadUnmappedFlag() ||
|
read.getReadUnmappedFlag() ||
|
||||||
read.getMappingQuality() <= 0
|
read.getMappingQuality() <= 0
|
||||||
|
|
||||||
// || (read.getReadPairedFlag() && (!read.getProperPairFlag() || read.getInferredInsertSize() >= MAX_INSERT_SIZE))
|
|| (read.getReadPairedFlag() && (!read.getProperPairFlag() || read.getInferredInsertSize() >= MAX_INSERT_SIZE))
|
||||||
) {
|
) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
@ -135,20 +216,20 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
||||||
String sample = read.getHeader().getReadGroup(rg).getSample();
|
String sample = read.getHeader().getReadGroup(rg).getSample();
|
||||||
|
|
||||||
int offset = context.getOffsets().get(i);
|
int offset = context.getOffsets().get(i);
|
||||||
|
|
||||||
char base = read.getReadString().charAt(offset);
|
char base = read.getReadString().charAt(offset);
|
||||||
byte qual = read.getBaseQualities()[offset];
|
|
||||||
|
|
||||||
|
|
||||||
if (base == 'N' || base == 'n') { continue; }
|
if (base == 'N' || base == 'n') { continue; }
|
||||||
|
|
||||||
|
|
||||||
|
// TODO: build a pile of reads and offsets, then pass that into a
|
||||||
|
// constructor for the normalGL class
|
||||||
|
// that way, we can build a different pile of reads later on and extract the genotype
|
||||||
if (normalSampleName.equals(sample)) {
|
if (normalSampleName.equals(sample)) {
|
||||||
normalGL.add(ref, base, qual);
|
normalReadPile.add(read, offset);
|
||||||
if (qual > MIN_QSCORE) normalQualitySums.incrementSum(base, qual);
|
|
||||||
normalBases.append(base);
|
|
||||||
} else if (tumorSampleName.equals(sample)) {
|
} else if (tumorSampleName.equals(sample)) {
|
||||||
tumorGL.add(ref, base, qual);
|
tumorReadPile.add(read, offset);
|
||||||
if (qual > MIN_QSCORE) tumorQualitySums.incrementSum(base, qual);
|
|
||||||
tumorBases.append(base);
|
|
||||||
|
|
||||||
int midDist = Math.abs((int)(read.getReadLength() / 2) - offset);
|
int midDist = Math.abs((int)(read.getReadLength() / 2) - offset);
|
||||||
if (midDist < midp.get(base)) { midp.put(base, midDist); }
|
if (midDist < midp.get(base)) { midp.put(base, midDist); }
|
||||||
|
|
@ -159,7 +240,7 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
// pretest: if the sum of the quality scores for all non-ref alleles < 60, just quit looking now
|
// pretest: if the sum of the quality scores for all non-ref alleles < 60, just quit looking now
|
||||||
if (tumorQualitySums.getOtherQualities(ref) < MIN_MUTANT_SUM_PRETEST) {
|
if (tumorReadPile.qualitySums.getOtherQualities(ref) < MIN_MUTANT_SUM_PRETEST) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -170,13 +251,18 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
||||||
if (altAllele == upRef) { continue; }
|
if (altAllele == upRef) { continue; }
|
||||||
|
|
||||||
|
|
||||||
double[] tumorRefAlt = extractRefAlt(tumorGL, ref, altAllele);
|
|
||||||
double tumorLod = Math.log10(tumorRefAlt[1] + tumorRefAlt[2]) - tumorRefAlt[0];
|
|
||||||
|
|
||||||
// (i) either an adjusted quality score sum in the tumor for the mutant base must be
|
// (i) either an adjusted quality score sum in the tumor for the mutant base must be
|
||||||
// at least 100 or the LOD score for mutant:ref + mutant:mutant vs ref:ref must
|
// at least 100 or the LOD score for mutant:ref + mutant:mutant vs ref:ref must
|
||||||
// be at least 6.3;
|
// be at least 6.3;
|
||||||
if (tumorQualitySums.get(altAllele) < MIN_MUTANT_SUM && tumorLod < TUMOR_LOD) {
|
int mutantSum = tumorReadPile.qualitySums.get(altAllele);
|
||||||
|
int refSum = tumorReadPile.qualitySums.get(ref);
|
||||||
|
|
||||||
|
if (tumorReadPile.getAltVsRef(altAllele) >= TUMOR_LOD
|
||||||
|
// ||
|
||||||
|
// (mutantSum >= MIN_MUTANT_SUM && (float)mutantSum / (float) refSum >= 0.05f)
|
||||||
|
) {
|
||||||
|
// yep -- just fall through... easier to think about this way!
|
||||||
|
} else {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -187,12 +273,64 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
||||||
// return 0;
|
// return 0;
|
||||||
// }
|
// }
|
||||||
|
|
||||||
double[] refAlt = extractRefAlt(normalGL, ref, altAllele);
|
|
||||||
double normalLod = refAlt[0] - Math.log10(refAlt[1] + refAlt[2]);
|
// do a MSA to figure out if the alternate reads comes from a cluster of reads with more
|
||||||
|
// than one alternate allele (hints at being an alignment artifact)
|
||||||
|
ReferenceSequence refSeq;
|
||||||
|
// TODO: don't hardcode. Make this the max read length in the pile
|
||||||
|
long refStart = context.getLocation().getStart() - 100;
|
||||||
|
//tumorReadPile.offsets.get(0);
|
||||||
|
long refStop = context.getLocation().getStart() + 100;
|
||||||
|
try {
|
||||||
|
IndexedFastaSequenceFile seqFile = new IndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
|
||||||
|
refSeq = seqFile.getSubsequenceAt(context.getContig(),refStart, refStop);
|
||||||
|
|
||||||
|
} catch (IOException ioe) {
|
||||||
|
throw new RuntimeException(ioe);
|
||||||
|
}
|
||||||
|
|
||||||
|
// System.out.println("TESTING " + context.getContig() + ":" + context.getPosition());
|
||||||
|
LocusReadPile t2 = filterHighMismatchScoreReads(tumorReadPile, StringUtil.bytesToString(refSeq.getBases()), refStart);
|
||||||
|
|
||||||
|
// TEST the LOD score again!
|
||||||
|
// TODO: extract this since we'll do it multiple times...
|
||||||
|
|
||||||
|
// (i) either an adjusted quality score sum in the tumor for the mutant base must be
|
||||||
|
// at least 100 or the LOD score for mutant:ref + mutant:mutant vs ref:ref must
|
||||||
|
// be at least 6.3;
|
||||||
|
double tumorLod = t2.getAltVsRef(altAllele);
|
||||||
|
if (t2.qualitySums.get(altAllele) < MIN_MUTANT_SUM && tumorLod < TUMOR_LOD) {
|
||||||
|
if (outputFailures) {
|
||||||
|
System.out.println("FAILED " + context.getContig() + ":" + context.getPosition() + " due to MAX MM QSCORE TEST." +
|
||||||
|
" LOD was " + tumorReadPile.getAltVsRef(altAllele) +
|
||||||
|
" LOD is now " + t2.getAltVsRef(altAllele) +
|
||||||
|
" QSUM was " + tumorReadPile.qualitySums.get(altAllele) +
|
||||||
|
" QSUM is now " + t2.qualitySums.get(altAllele));
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: using the original pile here since often these artifacts will be supported
|
||||||
|
// by those reads that get thrown out! Maybe that means we don't need the noise filter...
|
||||||
|
boolean shouldDisalign =
|
||||||
|
disaligner(context.getPosition(), tumorReadPile, StringUtil.bytesToString(refSeq.getBases()), refStart);
|
||||||
|
|
||||||
|
if (shouldDisalign) {
|
||||||
|
if (outputFailures) {
|
||||||
|
System.out.println("FAILED " + context.getContig() + ":" + context.getPosition() + " due to DISALIGNMENT TEST.");
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// (ii) the quality score sum for the mutant base in the normal must be < 50 and the
|
// (ii) the quality score sum for the mutant base in the normal must be < 50 and the
|
||||||
// LOD score for ref:ref vs mutant:ref + mutant:mutant must be at least 2.3.
|
// LOD score for ref:ref vs mutant:ref + mutant:mutant must be at least 2.3.
|
||||||
if ( normalQualitySums.get(altAllele) > 50 || normalLod < NORMAL_LOD ) {
|
double normalLod = normalReadPile.getRefVsAlt(altAllele);
|
||||||
|
if ( normalReadPile.qualitySums.get(altAllele) > 50 || normalLod < NORMAL_LOD ) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -204,11 +342,11 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
||||||
context.getPosition() + "\t" +
|
context.getPosition() + "\t" +
|
||||||
context.getPosition() + "\t" +
|
context.getPosition() + "\t" +
|
||||||
"TScore:" + tumorLod +
|
"TScore:" + tumorLod +
|
||||||
"__TRefSum: " + tumorQualitySums.get(ref) +
|
"__TRefSum: " + tumorReadPile.qualitySums.get(ref) +
|
||||||
"__TAltSum: " + tumorQualitySums.get(altAllele) +
|
"__TAltSum: " + tumorReadPile.qualitySums.get(altAllele) +
|
||||||
"__NScore:" + normalLod +
|
"__NScore:" + normalLod +
|
||||||
"__NRefSum: " + normalQualitySums.get(ref) +
|
"__NRefSum: " + normalReadPile.qualitySums.get(ref) +
|
||||||
"__NAltSum: " + normalQualitySums.get(altAllele) +
|
"__NAltSum: " + normalReadPile.qualitySums.get(altAllele) +
|
||||||
"__MIDP: " + midp.get(altAllele) +
|
"__MIDP: " + midp.get(altAllele) +
|
||||||
(failedMidpointCheck?"__FAILED-MPCHECK":"")
|
(failedMidpointCheck?"__FAILED-MPCHECK":"")
|
||||||
);
|
);
|
||||||
|
|
@ -216,13 +354,13 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
||||||
} else {
|
} else {
|
||||||
out.println(context.getLocation() + " " + upRef + " " + altAllele +
|
out.println(context.getLocation() + " " + upRef + " " + altAllele +
|
||||||
" TScore:" + tumorLod +
|
" TScore:" + tumorLod +
|
||||||
" TRefSum: " + tumorQualitySums.get(ref) +
|
" TRefSum: " + tumorReadPile.qualitySums.get(ref) +
|
||||||
" TAltSum: " + tumorQualitySums.get(altAllele) +
|
" TAltSum: " + tumorReadPile.qualitySums.get(altAllele) +
|
||||||
" NScore:" + normalLod +
|
" NScore:" + normalLod +
|
||||||
" NRefSum: " + normalQualitySums.get(ref) +
|
" NRefSum: " + normalReadPile.qualitySums.get(ref) +
|
||||||
" NAltSum: " + normalQualitySums.get(altAllele) + " " +
|
" NAltSum: " + normalReadPile.qualitySums.get(altAllele) + " " +
|
||||||
tumorBases.toString() + " " +
|
tumorReadPile.getLocusBases().toString() + " " +
|
||||||
normalBases.toString()
|
normalReadPile.getLocusBases().toString()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -235,34 +373,6 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Extract the LOD comparing ref:ref to ref:alt and alt:alt
|
|
||||||
*/
|
|
||||||
private double[] extractRefAlt(GenotypeLikelihoods gl, char ref, char altAllele) {
|
|
||||||
double refRef = 0;
|
|
||||||
double altRef = 0;
|
|
||||||
double altAlt = 0;
|
|
||||||
for(int j=0; j<10; j++) {
|
|
||||||
String gt = gl.genotypes[j];
|
|
||||||
double likelihood = gl.likelihoods[j];
|
|
||||||
|
|
||||||
// the ref:mutant theory
|
|
||||||
if ( (gt.charAt(0) == ref && gt.charAt(1) == altAllele) ||
|
|
||||||
(gt.charAt(0) == altAllele && gt.charAt(1) == ref) ) {
|
|
||||||
altRef += Math.pow(10, likelihood);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( gt.charAt(0) == altAllele && gt.charAt(1) == altAllele) {
|
|
||||||
altAlt += Math.pow(10, likelihood);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( gt.charAt(0) == ref && gt.charAt(1) == ref) {
|
|
||||||
refRef = likelihood;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
return new double[]{refRef, altRef, altAlt};
|
|
||||||
}
|
|
||||||
|
|
||||||
// Given result of map function
|
// Given result of map function
|
||||||
public Integer reduceInit() {
|
public Integer reduceInit() {
|
||||||
|
|
@ -279,4 +389,469 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int MAX_READ_MISMATCH_QUALITY_SCORE_SUM = 100;
|
||||||
|
private LocusReadPile filterHighMismatchScoreReads(LocusReadPile pile, String reference, long leftmostIndex) {
|
||||||
|
LocusReadPile newPile = new LocusReadPile(pile.refBase);
|
||||||
|
|
||||||
|
for ( int i=0; i< pile.reads.size(); i++) {
|
||||||
|
SAMRecord read = pile.reads.get(i);
|
||||||
|
int offset = pile.offsets.get(i);
|
||||||
|
|
||||||
|
AlignedRead aRead = new AlignedRead(read);
|
||||||
|
int mismatchScore = mismatchQualitySum(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex);
|
||||||
|
|
||||||
|
if (mismatchScore <= MAX_READ_MISMATCH_QUALITY_SCORE_SUM) {
|
||||||
|
newPile.add(read, offset);
|
||||||
|
} else {
|
||||||
|
char base = read.getReadString().charAt(offset);
|
||||||
|
|
||||||
|
// TODO: use a logger to output this information for debugging/Picard folks
|
||||||
|
//System.out.println("MMQSUM: Filtering out " + read.getReadName() + " with locus base " + base + " and ref of " + pile.refBase);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return newPile;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// also pass in the variant position
|
||||||
|
private boolean disaligner(long mutationLocus, LocusReadPile pile, String reference, long leftmostIndex) {
|
||||||
|
int VARIANT_SITE_MIN_QSCORE = 40; // ie 2x20
|
||||||
|
// build a 2d array of read # vs position
|
||||||
|
// Map<Long, Map<Character, List<Mismatch>>> matrix = new TreeMap<Long, Map<Character, List<Mismatch>>>();
|
||||||
|
|
||||||
|
int mutationLocusOffset = (int)(mutationLocus - leftmostIndex);
|
||||||
|
int[][] mmm = new int[pile.reads.size()][reference.length()];
|
||||||
|
|
||||||
|
// for every read which has an offset
|
||||||
|
|
||||||
|
|
||||||
|
for ( int i=0; i< pile.reads.size(); i++) {
|
||||||
|
SAMRecord read = pile.reads.get(i);
|
||||||
|
int offset = pile.offsets.get(i);
|
||||||
|
|
||||||
|
AlignedRead aRead = new AlignedRead(read);
|
||||||
|
List<Mismatch> mismatches = getMismatches(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex);
|
||||||
|
|
||||||
|
// now fill in with -1 where we had reference
|
||||||
|
// TODO: deal with other CIGAR strings?
|
||||||
|
for(long j=read.getAlignmentStart(); j <= read.getAlignmentEnd(); j++) {
|
||||||
|
mmm[i][(int)( j - leftmostIndex )] = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
for(Mismatch mm : mismatches) {
|
||||||
|
if (mm.mismatchBase == 'N') { continue; }
|
||||||
|
|
||||||
|
mmm[i][(int)mm.position] = mm.qualityScore;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// traverse in column-order
|
||||||
|
int[] altSums = new int[mmm[0].length];
|
||||||
|
for(int j=0; j<altSums.length; j++) {
|
||||||
|
|
||||||
|
for(int i=0; i<mmm.length; i++) {
|
||||||
|
if (mmm[i][j] > 0) altSums[j] += mmm[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// now check that we have at least 3 alignment artifacts, and that
|
||||||
|
// they are at least 5bp from eachother
|
||||||
|
// FIXME: what should these numbers be?
|
||||||
|
int varSites = 0;
|
||||||
|
int lastVarSite = -1;
|
||||||
|
for(int j=0; j<altSums.length; j++) {
|
||||||
|
if (altSums[j] >= VARIANT_SITE_MIN_QSCORE) {
|
||||||
|
if (lastVarSite > -1) {
|
||||||
|
if (j - lastVarSite > 5) {
|
||||||
|
varSites++;
|
||||||
|
lastVarSite = j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (varSites < 3) { return false; }
|
||||||
|
|
||||||
|
// now go through each read which has the alternate allele at the
|
||||||
|
// current site (the potential somatic mutation) and see what fraction of
|
||||||
|
// the "covered" variant bases were called as variant
|
||||||
|
Map<Integer, Float> fractionVariantSites = new HashMap<Integer, Float>();
|
||||||
|
for(int i=0; i<mmm.length; i++) {
|
||||||
|
|
||||||
|
// does this read have the variant allele?
|
||||||
|
if (mmm[i][mutationLocusOffset] > 0) {
|
||||||
|
float calledSites = 0;
|
||||||
|
float variantSites = 0;
|
||||||
|
for(int j=0; j<altSums.length; j++) {
|
||||||
|
if (altSums[j] > VARIANT_SITE_MIN_QSCORE && j != mutationLocusOffset) {
|
||||||
|
if (mmm[i][j] != 0) { calledSites++; }
|
||||||
|
if (mmm[i][j] > 0) { variantSites++; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fractionVariantSites.put(i, variantSites / calledSites);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// FIXME: what's the right statistic to use here?
|
||||||
|
if (getMedian(fractionVariantSites.values()) >= 0.75) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
protected float getMedian(Collection<Float> in) {
|
||||||
|
float[] data = new float[in.size()];
|
||||||
|
int i=0;
|
||||||
|
for(float f : in) { data[i++] = f; }
|
||||||
|
|
||||||
|
Arrays.sort(data);
|
||||||
|
|
||||||
|
int middle = data.length/2; // subscript of middle element
|
||||||
|
if (data.length%2 == 1) {
|
||||||
|
return data[middle];
|
||||||
|
} else {
|
||||||
|
// Even number -- return average of middle two
|
||||||
|
// Must cast the numbers to double before dividing.
|
||||||
|
return (data[middle-1] + data[middle]) / 2.0f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void clean(LocusReadPile pile, String reference, long leftmostIndex) {
|
||||||
|
// LocusReadPile newPile = new LocusReadPile(pile.refBase);
|
||||||
|
|
||||||
|
ArrayList<SAMRecord> refReads = new ArrayList<SAMRecord>();
|
||||||
|
ArrayList<AlignedRead> altReads = new ArrayList<AlignedRead>();
|
||||||
|
ArrayList<Boolean> altAlignmentsToTest = new ArrayList<Boolean>();
|
||||||
|
int totalMismatchSum = 0;
|
||||||
|
|
||||||
|
int MIN_MISMATCH_QSCORE = 10;
|
||||||
|
// decide which reads potentially need to be cleaned
|
||||||
|
for ( int i=0; i< pile.reads.size(); i++) {
|
||||||
|
SAMRecord read = pile.reads.get(i);
|
||||||
|
int offset = pile.offsets.get(i);
|
||||||
|
|
||||||
|
AlignedRead aRead = new AlignedRead(read);
|
||||||
|
int mismatchScore = mismatchQualitySum(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex, MIN_MISMATCH_QSCORE);
|
||||||
|
|
||||||
|
// if this doesn't match perfectly to the reference, let's try to clean it
|
||||||
|
if ( mismatchScore > 0 ) {
|
||||||
|
|
||||||
|
if ( mismatchScore > 100) {
|
||||||
|
System.out.println(read.getReadName() + " has a sum of mismatch quality scores of " + mismatchScore);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
altReads.add(aRead);
|
||||||
|
altAlignmentsToTest.add(true);
|
||||||
|
totalMismatchSum += mismatchScore;
|
||||||
|
aRead.setMismatchScoreToReference(mismatchScore);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// otherwise, we can emit it as is
|
||||||
|
else {
|
||||||
|
refReads.add(read);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// build a map of position ->
|
||||||
|
Consensus bestConsensus = null;
|
||||||
|
|
||||||
|
// for each alternative consensus to test, align it to the reference and create an alternative consensus
|
||||||
|
for ( int index = 0; index < altAlignmentsToTest.size(); index++ ) {
|
||||||
|
if ( altAlignmentsToTest.get(index) ) {
|
||||||
|
|
||||||
|
// do a pairwise alignment against the reference
|
||||||
|
AlignedRead aRead = altReads.get(index);
|
||||||
|
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString());
|
||||||
|
int refIdx = swConsensus.getAlignmentStart2wrt1();
|
||||||
|
if ( refIdx < 0 )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// create the new consensus
|
||||||
|
StringBuffer sb = new StringBuffer();
|
||||||
|
sb.append(reference.substring(0, refIdx));
|
||||||
|
Cigar c = swConsensus.getCigar();
|
||||||
|
logger.debug("CIGAR = " + cigarToString(c));
|
||||||
|
|
||||||
|
int indelCount = 0;
|
||||||
|
int altIdx = 0;
|
||||||
|
boolean ok_flag = true;
|
||||||
|
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
|
||||||
|
CigarElement ce = c.getCigarElement(i);
|
||||||
|
switch( ce.getOperator() ) {
|
||||||
|
case D:
|
||||||
|
indelCount++;
|
||||||
|
refIdx += ce.getLength();
|
||||||
|
break;
|
||||||
|
case M:
|
||||||
|
if ( reference.length() < refIdx+ce.getLength() )
|
||||||
|
ok_flag = false;
|
||||||
|
else
|
||||||
|
sb.append(reference.substring(refIdx, refIdx+ce.getLength()));
|
||||||
|
refIdx += ce.getLength();
|
||||||
|
altIdx += ce.getLength();
|
||||||
|
break;
|
||||||
|
case I:
|
||||||
|
sb.append(aRead.getReadString().substring(altIdx, altIdx+ce.getLength()));
|
||||||
|
altIdx += ce.getLength();
|
||||||
|
indelCount++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// make sure that there is at most only a single indel and it aligns appropriately!
|
||||||
|
if ( !ok_flag || indelCount > 1 || reference.length() < refIdx )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
sb.append(reference.substring(refIdx));
|
||||||
|
String altConsensus = sb.toString();
|
||||||
|
|
||||||
|
// for each imperfect match to the reference, score it against this alternative
|
||||||
|
Consensus consensus = new Consensus(altConsensus, c, swConsensus.getAlignmentStart2wrt1());
|
||||||
|
for ( int j = 0; j < altReads.size(); j++ ) {
|
||||||
|
AlignedRead toTest = altReads.get(j);
|
||||||
|
Pair<Integer, Integer> altAlignment = findBestOffset(altConsensus, toTest);
|
||||||
|
|
||||||
|
// the mismatch score is the min of its alignment vs. the reference and vs. the alternate
|
||||||
|
int myScore = altAlignment.getSecond();
|
||||||
|
if ( myScore >= toTest.getMismatchScoreToReference() )
|
||||||
|
myScore = toTest.getMismatchScoreToReference();
|
||||||
|
// keep track of reads that align better to the alternate consensus
|
||||||
|
else
|
||||||
|
consensus.readIndexes.add(new Pair<Integer, Integer>(j, altAlignment.getFirst()));
|
||||||
|
|
||||||
|
logger.debug(aRead.getReadString() + " vs. " + toTest.getReadString() + " => " + myScore + " - " + altAlignment.getFirst());
|
||||||
|
consensus.mismatchSum += myScore;
|
||||||
|
if ( myScore == 0 )
|
||||||
|
// we already know that this is its consensus, so don't bother testing it later
|
||||||
|
altAlignmentsToTest.set(j, false);
|
||||||
|
}
|
||||||
|
logger.debug(aRead.getReadString() + " " + consensus.mismatchSum);
|
||||||
|
if ( bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) {
|
||||||
|
bestConsensus = consensus;
|
||||||
|
logger.debug(aRead.getReadString() + " " + consensus.mismatchSum);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
System.out.println("Done with MSA");
|
||||||
|
// // if the best alternate consensus has a smaller sum of quality score mismatches (more than the LOD threshold), then clean!
|
||||||
|
// if ( bestConsensus != null && ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0 >= LOD_THRESHOLD ) {
|
||||||
|
// logger.debug("CLEAN: " + bestConsensus.str );
|
||||||
|
// if ( indelOutput != null && bestConsensus.cigar.numCigarElements() > 1 ) {
|
||||||
|
// StringBuffer str = new StringBuffer();
|
||||||
|
// str.append(reads.get(0).getReferenceName());
|
||||||
|
// int position = bestConsensus.positionOnReference + bestConsensus.cigar.getCigarElement(0).getLength();
|
||||||
|
// str.append(":" + (leftmostIndex + position));
|
||||||
|
// CigarElement ce = bestConsensus.cigar.getCigarElement(1);
|
||||||
|
// str.append("\t" + ce.getLength() + ce.getOperator());
|
||||||
|
// str.append("\t" + (((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0) + "\n");
|
||||||
|
// try {
|
||||||
|
// indelOutput.write(str.toString());
|
||||||
|
// indelOutput.flush();
|
||||||
|
// } catch (Exception e) {}
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// // We need to update the mapping quality score of the cleaned reads;
|
||||||
|
// // however we don't have enough info to use the proper MAQ scoring system.
|
||||||
|
// // For now, we'll use a heuristic:
|
||||||
|
// // the mapping quality score is improved by the LOD difference in mismatching
|
||||||
|
// // bases between the reference and alternate consensus
|
||||||
|
// int improvement = (totalMismatchSum - bestConsensus.mismatchSum) / 10;
|
||||||
|
//
|
||||||
|
// // clean the appropriate reads
|
||||||
|
// for ( Pair<Integer, Integer> indexPair : bestConsensus.readIndexes ) {
|
||||||
|
// AlignedRead aRead = altReads.get(indexPair.getFirst());
|
||||||
|
// updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.getSecond(), aRead, (int)leftmostIndex);
|
||||||
|
// aRead.getRead().setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + improvement, 255));
|
||||||
|
// aRead.getRead().setAttribute("NM", numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex));
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// // write them out
|
||||||
|
// for ( SAMRecord rec : refReads )
|
||||||
|
// readsToWrite.add(new ComparableSAMRecord(rec));
|
||||||
|
// for ( AlignedRead aRec : altReads )
|
||||||
|
// readsToWrite.add(new ComparableSAMRecord(aRec.getRead()));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// ----------------------------------- PRIVATE IN IntervalCleanerWalker
|
||||||
|
public static final int MAX_QUAL = 99;
|
||||||
|
|
||||||
|
private static class Mismatch {
|
||||||
|
AlignedRead aRead;
|
||||||
|
int offset;
|
||||||
|
long position;
|
||||||
|
char mismatchBase;
|
||||||
|
int qualityScore;
|
||||||
|
|
||||||
|
private Mismatch(AlignedRead aRead, int offset, long position, char mismatchBase, int qualityScore) {
|
||||||
|
this.aRead = aRead;
|
||||||
|
this.offset = offset;
|
||||||
|
this.position = position;
|
||||||
|
this.mismatchBase = mismatchBase;
|
||||||
|
this.qualityScore = qualityScore;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int mismatchQualitySum(AlignedRead aRead, String refSeq, int refIndex) {
|
||||||
|
return mismatchQualitySum(aRead, refSeq, refIndex, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int mismatchQualitySum(AlignedRead aRead, String refSeq, int refIndex, int minMismatchQualityScore) {
|
||||||
|
List<Mismatch> mismatches = getMismatches(aRead, refSeq, refIndex);
|
||||||
|
int sum = 0;
|
||||||
|
for(Mismatch mm : mismatches) {
|
||||||
|
if (mm.qualityScore >= minMismatchQualityScore) {
|
||||||
|
sum += mm.qualityScore;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a list of position
|
||||||
|
* @param aRead
|
||||||
|
* @param refSeq
|
||||||
|
* @param refIndex
|
||||||
|
* @param minMismatchQualityScore
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private static List<Mismatch> getMismatches(AlignedRead aRead, String refSeq, int refIndex) {
|
||||||
|
List<Mismatch> mismatches = new ArrayList<Mismatch>();
|
||||||
|
|
||||||
|
String readSeq = aRead.getReadString();
|
||||||
|
String quals = aRead.getBaseQualityString();
|
||||||
|
int readIndex = 0;
|
||||||
|
int sum = 0;
|
||||||
|
Cigar c = aRead.getCigar();
|
||||||
|
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
||||||
|
CigarElement ce = c.getCigarElement(i);
|
||||||
|
switch ( ce.getOperator() ) {
|
||||||
|
case M:
|
||||||
|
for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIndex++ ) {
|
||||||
|
// FIXME: what is this case????
|
||||||
|
if ( refIndex >= refSeq.length() ) {
|
||||||
|
sum += MAX_QUAL;
|
||||||
|
} else {
|
||||||
|
char readBase = Character.toUpperCase(readSeq.charAt(readIndex));
|
||||||
|
char refBase = Character.toUpperCase(refSeq.charAt(refIndex));
|
||||||
|
|
||||||
|
if ( readBase != refBase ) {
|
||||||
|
int qual = (int)quals.charAt(readIndex) - 33;
|
||||||
|
mismatches.add(new Mismatch(aRead, readIndex, refIndex, readBase, qual));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case I:
|
||||||
|
readIndex += ce.getLength();
|
||||||
|
break;
|
||||||
|
case D:
|
||||||
|
refIndex += ce.getLength();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
return mismatches;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Pair<Integer, Integer> findBestOffset(String ref, AlignedRead read) {
|
||||||
|
int attempts = ref.length() - read.getReadLength() + 1;
|
||||||
|
int bestScore = mismatchQualitySum(read, ref, 0);
|
||||||
|
int bestIndex = 0;
|
||||||
|
for ( int i = 1; i < attempts; i++ ) {
|
||||||
|
// we can't get better than 0!
|
||||||
|
if ( bestScore == 0 )
|
||||||
|
return new Pair<Integer, Integer>(bestIndex, 0);
|
||||||
|
int score = mismatchQualitySum(read, ref, i);
|
||||||
|
if ( score < bestScore ) {
|
||||||
|
bestScore = score;
|
||||||
|
bestIndex = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new Pair<Integer, Integer>(bestIndex, bestScore);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private class AlignedRead {
|
||||||
|
SAMRecord read;
|
||||||
|
int mismatchScoreToReference;
|
||||||
|
|
||||||
|
public AlignedRead(SAMRecord read) {
|
||||||
|
this.read = read;
|
||||||
|
mismatchScoreToReference = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SAMRecord getRead() {
|
||||||
|
return read;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getReadString() {
|
||||||
|
return read.getReadString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getReadLength() {
|
||||||
|
return read.getReadLength();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Cigar getCigar() {
|
||||||
|
return read.getCigar();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setCigar(Cigar cigar) {
|
||||||
|
read.setCigar(cigar);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getBaseQualityString() {
|
||||||
|
return read.getBaseQualityString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMismatchScoreToReference(int score) {
|
||||||
|
mismatchScoreToReference = score;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getMismatchScoreToReference() {
|
||||||
|
return mismatchScoreToReference;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class Consensus {
|
||||||
|
public String str;
|
||||||
|
public int mismatchSum;
|
||||||
|
public int positionOnReference;
|
||||||
|
public Cigar cigar;
|
||||||
|
public ArrayList<Pair<Integer, Integer>> readIndexes;
|
||||||
|
|
||||||
|
public Consensus(String str, Cigar cigar, int positionOnReference) {
|
||||||
|
this.str = str;
|
||||||
|
this.cigar = cigar;
|
||||||
|
this.positionOnReference = positionOnReference;
|
||||||
|
mismatchSum = 0;
|
||||||
|
readIndexes = new ArrayList<Pair<Integer, Integer>>();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String cigarToString(Cigar cig) {
|
||||||
|
StringBuilder b = new StringBuilder();
|
||||||
|
|
||||||
|
for ( int i = 0 ; i < cig.numCigarElements() ; i++ ) {
|
||||||
|
char c='?';
|
||||||
|
switch ( cig.getCigarElement(i).getOperator() ) {
|
||||||
|
case M : c = 'M'; break;
|
||||||
|
case D : c = 'D'; break;
|
||||||
|
case I : c = 'I'; break;
|
||||||
|
}
|
||||||
|
b.append(cig.getCigarElement(i).getLength());
|
||||||
|
b.append(c);
|
||||||
|
}
|
||||||
|
return b.toString();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
Loading…
Reference in New Issue