basic clustering of reads to reduce artifacts
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@873 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e712d69382
commit
c4cb867d74
|
|
@ -1,20 +1,25 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.walkers.By;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.playground.indels.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
@By(DataSource.REFERENCE)
|
||||
public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
||||
|
|
@ -78,10 +83,91 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
|||
|
||||
public static int MIN_QSCORE = 13;
|
||||
|
||||
public static double TUMOR_LOD = 6.3d;
|
||||
// public static double TUMOR_LOD = 1.0d;
|
||||
// public static double TUMOR_LOD = 6.3d;
|
||||
public static double TUMOR_LOD = 4.0d;
|
||||
public static double NORMAL_LOD = 2.3d;
|
||||
|
||||
private String cachedContigName;
|
||||
private byte[] cachedContig;
|
||||
|
||||
private boolean outputFailures = true;
|
||||
|
||||
private static class LocusReadPile {
|
||||
private char refBase;
|
||||
public List<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||
public List<Integer> offsets = new ArrayList<Integer>();
|
||||
public GenotypeLikelihoods likelihoods = new GenotypeLikelihoods();
|
||||
public QualitySums qualitySums = new QualitySums();
|
||||
|
||||
public LocusReadPile(char refBase) {
|
||||
this.refBase = refBase;
|
||||
}
|
||||
|
||||
public void add(SAMRecord read, int offset) {
|
||||
char base = read.getReadString().charAt(offset);
|
||||
byte qual = read.getBaseQualities()[offset];
|
||||
|
||||
if (base == 'N' || base == 'n') { return; }
|
||||
|
||||
reads.add(read);
|
||||
offsets.add(offset);
|
||||
|
||||
|
||||
likelihoods.add(refBase, base, qual);
|
||||
if (qual > MIN_QSCORE) qualitySums.incrementSum(base, qual);
|
||||
|
||||
}
|
||||
|
||||
public String getLocusBases() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for(int i=0; i<reads.size(); i++) {
|
||||
sb.append(reads.get(i).getReadString().charAt(offsets.get(i)));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public double getAltVsRef(char altAllele) {
|
||||
double[] tumorRefAlt = extractRefAlt(this.likelihoods, this.refBase, altAllele);
|
||||
double tumorLod = Math.log10(tumorRefAlt[1] + tumorRefAlt[2]) - tumorRefAlt[0];
|
||||
return tumorLod;
|
||||
}
|
||||
|
||||
public double getRefVsAlt(char altAllele) {
|
||||
double[] refAlt = extractRefAlt(this.likelihoods, this.refBase, altAllele);
|
||||
double normalLod = refAlt[0] - Math.log10(refAlt[1] + refAlt[2]);
|
||||
return normalLod;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the LOD comparing ref:ref to ref:alt and alt:alt
|
||||
*/
|
||||
private double[] extractRefAlt(GenotypeLikelihoods gl, char ref, char altAllele) {
|
||||
double refRef = 0;
|
||||
double altRef = 0;
|
||||
double altAlt = 0;
|
||||
for(int j=0; j<10; j++) {
|
||||
String gt = gl.genotypes[j];
|
||||
double likelihood = gl.likelihoods[j];
|
||||
|
||||
// the ref:mutant theory
|
||||
if ( (gt.charAt(0) == ref && gt.charAt(1) == altAllele) ||
|
||||
(gt.charAt(0) == altAllele && gt.charAt(1) == ref) ) {
|
||||
altRef += Math.pow(10, likelihood);
|
||||
}
|
||||
|
||||
if ( gt.charAt(0) == altAllele && gt.charAt(1) == altAllele) {
|
||||
altAlt += Math.pow(10, likelihood);
|
||||
}
|
||||
|
||||
if ( gt.charAt(0) == ref && gt.charAt(1) == ref) {
|
||||
refRef = likelihood;
|
||||
}
|
||||
|
||||
}
|
||||
return new double[]{refRef, altRef, altAlt};
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
Map<Character, Integer> midp = new HashMap<Character, Integer>();
|
||||
|
|
@ -109,14 +195,9 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
|||
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
|
||||
GenotypeLikelihoods tumorGL = new GenotypeLikelihoods();
|
||||
GenotypeLikelihoods normalGL = new GenotypeLikelihoods();
|
||||
LocusReadPile tumorReadPile = new LocusReadPile(ref);
|
||||
LocusReadPile normalReadPile = new LocusReadPile(ref);
|
||||
|
||||
QualitySums tumorQualitySums = new QualitySums();
|
||||
QualitySums normalQualitySums = new QualitySums();
|
||||
|
||||
StringBuilder tumorBases = new StringBuilder();
|
||||
StringBuilder normalBases = new StringBuilder();
|
||||
for ( int i = 0; i < reads.size(); i++ )
|
||||
{
|
||||
SAMRecord read = reads.get(i);
|
||||
|
|
@ -126,7 +207,7 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
|||
read.getReadUnmappedFlag() ||
|
||||
read.getMappingQuality() <= 0
|
||||
|
||||
// || (read.getReadPairedFlag() && (!read.getProperPairFlag() || read.getInferredInsertSize() >= MAX_INSERT_SIZE))
|
||||
|| (read.getReadPairedFlag() && (!read.getProperPairFlag() || read.getInferredInsertSize() >= MAX_INSERT_SIZE))
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -135,20 +216,20 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
|||
String sample = read.getHeader().getReadGroup(rg).getSample();
|
||||
|
||||
int offset = context.getOffsets().get(i);
|
||||
|
||||
char base = read.getReadString().charAt(offset);
|
||||
byte qual = read.getBaseQualities()[offset];
|
||||
|
||||
|
||||
if (base == 'N' || base == 'n') { continue; }
|
||||
|
||||
|
||||
|
||||
// TODO: build a pile of reads and offsets, then pass that into a
|
||||
// constructor for the normalGL class
|
||||
// that way, we can build a different pile of reads later on and extract the genotype
|
||||
if (normalSampleName.equals(sample)) {
|
||||
normalGL.add(ref, base, qual);
|
||||
if (qual > MIN_QSCORE) normalQualitySums.incrementSum(base, qual);
|
||||
normalBases.append(base);
|
||||
normalReadPile.add(read, offset);
|
||||
|
||||
} else if (tumorSampleName.equals(sample)) {
|
||||
tumorGL.add(ref, base, qual);
|
||||
if (qual > MIN_QSCORE) tumorQualitySums.incrementSum(base, qual);
|
||||
tumorBases.append(base);
|
||||
tumorReadPile.add(read, offset);
|
||||
|
||||
|
||||
int midDist = Math.abs((int)(read.getReadLength() / 2) - offset);
|
||||
if (midDist < midp.get(base)) { midp.put(base, midDist); }
|
||||
|
|
@ -159,7 +240,7 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
// pretest: if the sum of the quality scores for all non-ref alleles < 60, just quit looking now
|
||||
if (tumorQualitySums.getOtherQualities(ref) < MIN_MUTANT_SUM_PRETEST) {
|
||||
if (tumorReadPile.qualitySums.getOtherQualities(ref) < MIN_MUTANT_SUM_PRETEST) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
|
@ -170,13 +251,18 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
|||
if (altAllele == upRef) { continue; }
|
||||
|
||||
|
||||
double[] tumorRefAlt = extractRefAlt(tumorGL, ref, altAllele);
|
||||
double tumorLod = Math.log10(tumorRefAlt[1] + tumorRefAlt[2]) - tumorRefAlt[0];
|
||||
|
||||
// (i) either an adjusted quality score sum in the tumor for the mutant base must be
|
||||
// at least 100 or the LOD score for mutant:ref + mutant:mutant vs ref:ref must
|
||||
// be at least 6.3;
|
||||
if (tumorQualitySums.get(altAllele) < MIN_MUTANT_SUM && tumorLod < TUMOR_LOD) {
|
||||
int mutantSum = tumorReadPile.qualitySums.get(altAllele);
|
||||
int refSum = tumorReadPile.qualitySums.get(ref);
|
||||
|
||||
if (tumorReadPile.getAltVsRef(altAllele) >= TUMOR_LOD
|
||||
// ||
|
||||
// (mutantSum >= MIN_MUTANT_SUM && (float)mutantSum / (float) refSum >= 0.05f)
|
||||
) {
|
||||
// yep -- just fall through... easier to think about this way!
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -187,12 +273,64 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
|||
// return 0;
|
||||
// }
|
||||
|
||||
double[] refAlt = extractRefAlt(normalGL, ref, altAllele);
|
||||
double normalLod = refAlt[0] - Math.log10(refAlt[1] + refAlt[2]);
|
||||
|
||||
// do a MSA to figure out if the alternate reads comes from a cluster of reads with more
|
||||
// than one alternate allele (hints at being an alignment artifact)
|
||||
ReferenceSequence refSeq;
|
||||
// TODO: don't hardcode. Make this the max read length in the pile
|
||||
long refStart = context.getLocation().getStart() - 100;
|
||||
//tumorReadPile.offsets.get(0);
|
||||
long refStop = context.getLocation().getStart() + 100;
|
||||
try {
|
||||
IndexedFastaSequenceFile seqFile = new IndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
|
||||
refSeq = seqFile.getSubsequenceAt(context.getContig(),refStart, refStop);
|
||||
|
||||
} catch (IOException ioe) {
|
||||
throw new RuntimeException(ioe);
|
||||
}
|
||||
|
||||
// System.out.println("TESTING " + context.getContig() + ":" + context.getPosition());
|
||||
LocusReadPile t2 = filterHighMismatchScoreReads(tumorReadPile, StringUtil.bytesToString(refSeq.getBases()), refStart);
|
||||
|
||||
// TEST the LOD score again!
|
||||
// TODO: extract this since we'll do it multiple times...
|
||||
|
||||
// (i) either an adjusted quality score sum in the tumor for the mutant base must be
|
||||
// at least 100 or the LOD score for mutant:ref + mutant:mutant vs ref:ref must
|
||||
// be at least 6.3;
|
||||
double tumorLod = t2.getAltVsRef(altAllele);
|
||||
if (t2.qualitySums.get(altAllele) < MIN_MUTANT_SUM && tumorLod < TUMOR_LOD) {
|
||||
if (outputFailures) {
|
||||
System.out.println("FAILED " + context.getContig() + ":" + context.getPosition() + " due to MAX MM QSCORE TEST." +
|
||||
" LOD was " + tumorReadPile.getAltVsRef(altAllele) +
|
||||
" LOD is now " + t2.getAltVsRef(altAllele) +
|
||||
" QSUM was " + tumorReadPile.qualitySums.get(altAllele) +
|
||||
" QSUM is now " + t2.qualitySums.get(altAllele));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO: using the original pile here since often these artifacts will be supported
|
||||
// by those reads that get thrown out! Maybe that means we don't need the noise filter...
|
||||
boolean shouldDisalign =
|
||||
disaligner(context.getPosition(), tumorReadPile, StringUtil.bytesToString(refSeq.getBases()), refStart);
|
||||
|
||||
if (shouldDisalign) {
|
||||
if (outputFailures) {
|
||||
System.out.println("FAILED " + context.getContig() + ":" + context.getPosition() + " due to DISALIGNMENT TEST.");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// (ii) the quality score sum for the mutant base in the normal must be < 50 and the
|
||||
// LOD score for ref:ref vs mutant:ref + mutant:mutant must be at least 2.3.
|
||||
if ( normalQualitySums.get(altAllele) > 50 || normalLod < NORMAL_LOD ) {
|
||||
double normalLod = normalReadPile.getRefVsAlt(altAllele);
|
||||
if ( normalReadPile.qualitySums.get(altAllele) > 50 || normalLod < NORMAL_LOD ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -204,11 +342,11 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
|||
context.getPosition() + "\t" +
|
||||
context.getPosition() + "\t" +
|
||||
"TScore:" + tumorLod +
|
||||
"__TRefSum: " + tumorQualitySums.get(ref) +
|
||||
"__TAltSum: " + tumorQualitySums.get(altAllele) +
|
||||
"__TRefSum: " + tumorReadPile.qualitySums.get(ref) +
|
||||
"__TAltSum: " + tumorReadPile.qualitySums.get(altAllele) +
|
||||
"__NScore:" + normalLod +
|
||||
"__NRefSum: " + normalQualitySums.get(ref) +
|
||||
"__NAltSum: " + normalQualitySums.get(altAllele) +
|
||||
"__NRefSum: " + normalReadPile.qualitySums.get(ref) +
|
||||
"__NAltSum: " + normalReadPile.qualitySums.get(altAllele) +
|
||||
"__MIDP: " + midp.get(altAllele) +
|
||||
(failedMidpointCheck?"__FAILED-MPCHECK":"")
|
||||
);
|
||||
|
|
@ -216,13 +354,13 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
|||
} else {
|
||||
out.println(context.getLocation() + " " + upRef + " " + altAllele +
|
||||
" TScore:" + tumorLod +
|
||||
" TRefSum: " + tumorQualitySums.get(ref) +
|
||||
" TAltSum: " + tumorQualitySums.get(altAllele) +
|
||||
" TRefSum: " + tumorReadPile.qualitySums.get(ref) +
|
||||
" TAltSum: " + tumorReadPile.qualitySums.get(altAllele) +
|
||||
" NScore:" + normalLod +
|
||||
" NRefSum: " + normalQualitySums.get(ref) +
|
||||
" NAltSum: " + normalQualitySums.get(altAllele) + " " +
|
||||
tumorBases.toString() + " " +
|
||||
normalBases.toString()
|
||||
" NRefSum: " + normalReadPile.qualitySums.get(ref) +
|
||||
" NAltSum: " + normalReadPile.qualitySums.get(altAllele) + " " +
|
||||
tumorReadPile.getLocusBases().toString() + " " +
|
||||
normalReadPile.getLocusBases().toString()
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -235,34 +373,6 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
|||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the LOD comparing ref:ref to ref:alt and alt:alt
|
||||
*/
|
||||
private double[] extractRefAlt(GenotypeLikelihoods gl, char ref, char altAllele) {
|
||||
double refRef = 0;
|
||||
double altRef = 0;
|
||||
double altAlt = 0;
|
||||
for(int j=0; j<10; j++) {
|
||||
String gt = gl.genotypes[j];
|
||||
double likelihood = gl.likelihoods[j];
|
||||
|
||||
// the ref:mutant theory
|
||||
if ( (gt.charAt(0) == ref && gt.charAt(1) == altAllele) ||
|
||||
(gt.charAt(0) == altAllele && gt.charAt(1) == ref) ) {
|
||||
altRef += Math.pow(10, likelihood);
|
||||
}
|
||||
|
||||
if ( gt.charAt(0) == altAllele && gt.charAt(1) == altAllele) {
|
||||
altAlt += Math.pow(10, likelihood);
|
||||
}
|
||||
|
||||
if ( gt.charAt(0) == ref && gt.charAt(1) == ref) {
|
||||
refRef = likelihood;
|
||||
}
|
||||
|
||||
}
|
||||
return new double[]{refRef, altRef, altAlt};
|
||||
}
|
||||
|
||||
// Given result of map function
|
||||
public Integer reduceInit() {
|
||||
|
|
@ -279,4 +389,469 @@ public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
|
|||
|
||||
|
||||
|
||||
int MAX_READ_MISMATCH_QUALITY_SCORE_SUM = 100;
|
||||
private LocusReadPile filterHighMismatchScoreReads(LocusReadPile pile, String reference, long leftmostIndex) {
|
||||
LocusReadPile newPile = new LocusReadPile(pile.refBase);
|
||||
|
||||
for ( int i=0; i< pile.reads.size(); i++) {
|
||||
SAMRecord read = pile.reads.get(i);
|
||||
int offset = pile.offsets.get(i);
|
||||
|
||||
AlignedRead aRead = new AlignedRead(read);
|
||||
int mismatchScore = mismatchQualitySum(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex);
|
||||
|
||||
if (mismatchScore <= MAX_READ_MISMATCH_QUALITY_SCORE_SUM) {
|
||||
newPile.add(read, offset);
|
||||
} else {
|
||||
char base = read.getReadString().charAt(offset);
|
||||
|
||||
// TODO: use a logger to output this information for debugging/Picard folks
|
||||
//System.out.println("MMQSUM: Filtering out " + read.getReadName() + " with locus base " + base + " and ref of " + pile.refBase);
|
||||
}
|
||||
}
|
||||
return newPile;
|
||||
}
|
||||
|
||||
|
||||
// also pass in the variant position
|
||||
private boolean disaligner(long mutationLocus, LocusReadPile pile, String reference, long leftmostIndex) {
|
||||
int VARIANT_SITE_MIN_QSCORE = 40; // ie 2x20
|
||||
// build a 2d array of read # vs position
|
||||
// Map<Long, Map<Character, List<Mismatch>>> matrix = new TreeMap<Long, Map<Character, List<Mismatch>>>();
|
||||
|
||||
int mutationLocusOffset = (int)(mutationLocus - leftmostIndex);
|
||||
int[][] mmm = new int[pile.reads.size()][reference.length()];
|
||||
|
||||
// for every read which has an offset
|
||||
|
||||
|
||||
for ( int i=0; i< pile.reads.size(); i++) {
|
||||
SAMRecord read = pile.reads.get(i);
|
||||
int offset = pile.offsets.get(i);
|
||||
|
||||
AlignedRead aRead = new AlignedRead(read);
|
||||
List<Mismatch> mismatches = getMismatches(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex);
|
||||
|
||||
// now fill in with -1 where we had reference
|
||||
// TODO: deal with other CIGAR strings?
|
||||
for(long j=read.getAlignmentStart(); j <= read.getAlignmentEnd(); j++) {
|
||||
mmm[i][(int)( j - leftmostIndex )] = -1;
|
||||
}
|
||||
|
||||
|
||||
for(Mismatch mm : mismatches) {
|
||||
if (mm.mismatchBase == 'N') { continue; }
|
||||
|
||||
mmm[i][(int)mm.position] = mm.qualityScore;
|
||||
}
|
||||
}
|
||||
|
||||
// traverse in column-order
|
||||
int[] altSums = new int[mmm[0].length];
|
||||
for(int j=0; j<altSums.length; j++) {
|
||||
|
||||
for(int i=0; i<mmm.length; i++) {
|
||||
if (mmm[i][j] > 0) altSums[j] += mmm[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// now check that we have at least 3 alignment artifacts, and that
|
||||
// they are at least 5bp from eachother
|
||||
// FIXME: what should these numbers be?
|
||||
int varSites = 0;
|
||||
int lastVarSite = -1;
|
||||
for(int j=0; j<altSums.length; j++) {
|
||||
if (altSums[j] >= VARIANT_SITE_MIN_QSCORE) {
|
||||
if (lastVarSite > -1) {
|
||||
if (j - lastVarSite > 5) {
|
||||
varSites++;
|
||||
lastVarSite = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (varSites < 3) { return false; }
|
||||
|
||||
// now go through each read which has the alternate allele at the
|
||||
// current site (the potential somatic mutation) and see what fraction of
|
||||
// the "covered" variant bases were called as variant
|
||||
Map<Integer, Float> fractionVariantSites = new HashMap<Integer, Float>();
|
||||
for(int i=0; i<mmm.length; i++) {
|
||||
|
||||
// does this read have the variant allele?
|
||||
if (mmm[i][mutationLocusOffset] > 0) {
|
||||
float calledSites = 0;
|
||||
float variantSites = 0;
|
||||
for(int j=0; j<altSums.length; j++) {
|
||||
if (altSums[j] > VARIANT_SITE_MIN_QSCORE && j != mutationLocusOffset) {
|
||||
if (mmm[i][j] != 0) { calledSites++; }
|
||||
if (mmm[i][j] > 0) { variantSites++; }
|
||||
}
|
||||
}
|
||||
fractionVariantSites.put(i, variantSites / calledSites);
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: what's the right statistic to use here?
|
||||
if (getMedian(fractionVariantSites.values()) >= 0.75) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
protected float getMedian(Collection<Float> in) {
|
||||
float[] data = new float[in.size()];
|
||||
int i=0;
|
||||
for(float f : in) { data[i++] = f; }
|
||||
|
||||
Arrays.sort(data);
|
||||
|
||||
int middle = data.length/2; // subscript of middle element
|
||||
if (data.length%2 == 1) {
|
||||
return data[middle];
|
||||
} else {
|
||||
// Even number -- return average of middle two
|
||||
// Must cast the numbers to double before dividing.
|
||||
return (data[middle-1] + data[middle]) / 2.0f;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void clean(LocusReadPile pile, String reference, long leftmostIndex) {
|
||||
// LocusReadPile newPile = new LocusReadPile(pile.refBase);
|
||||
|
||||
ArrayList<SAMRecord> refReads = new ArrayList<SAMRecord>();
|
||||
ArrayList<AlignedRead> altReads = new ArrayList<AlignedRead>();
|
||||
ArrayList<Boolean> altAlignmentsToTest = new ArrayList<Boolean>();
|
||||
int totalMismatchSum = 0;
|
||||
|
||||
int MIN_MISMATCH_QSCORE = 10;
|
||||
// decide which reads potentially need to be cleaned
|
||||
for ( int i=0; i< pile.reads.size(); i++) {
|
||||
SAMRecord read = pile.reads.get(i);
|
||||
int offset = pile.offsets.get(i);
|
||||
|
||||
AlignedRead aRead = new AlignedRead(read);
|
||||
int mismatchScore = mismatchQualitySum(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex, MIN_MISMATCH_QSCORE);
|
||||
|
||||
// if this doesn't match perfectly to the reference, let's try to clean it
|
||||
if ( mismatchScore > 0 ) {
|
||||
|
||||
if ( mismatchScore > 100) {
|
||||
System.out.println(read.getReadName() + " has a sum of mismatch quality scores of " + mismatchScore);
|
||||
|
||||
} else {
|
||||
altReads.add(aRead);
|
||||
altAlignmentsToTest.add(true);
|
||||
totalMismatchSum += mismatchScore;
|
||||
aRead.setMismatchScoreToReference(mismatchScore);
|
||||
}
|
||||
}
|
||||
// otherwise, we can emit it as is
|
||||
else {
|
||||
refReads.add(read);
|
||||
}
|
||||
}
|
||||
|
||||
// build a map of position ->
|
||||
Consensus bestConsensus = null;
|
||||
|
||||
// for each alternative consensus to test, align it to the reference and create an alternative consensus
|
||||
for ( int index = 0; index < altAlignmentsToTest.size(); index++ ) {
|
||||
if ( altAlignmentsToTest.get(index) ) {
|
||||
|
||||
// do a pairwise alignment against the reference
|
||||
AlignedRead aRead = altReads.get(index);
|
||||
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString());
|
||||
int refIdx = swConsensus.getAlignmentStart2wrt1();
|
||||
if ( refIdx < 0 )
|
||||
continue;
|
||||
|
||||
// create the new consensus
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append(reference.substring(0, refIdx));
|
||||
Cigar c = swConsensus.getCigar();
|
||||
logger.debug("CIGAR = " + cigarToString(c));
|
||||
|
||||
int indelCount = 0;
|
||||
int altIdx = 0;
|
||||
boolean ok_flag = true;
|
||||
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
|
||||
CigarElement ce = c.getCigarElement(i);
|
||||
switch( ce.getOperator() ) {
|
||||
case D:
|
||||
indelCount++;
|
||||
refIdx += ce.getLength();
|
||||
break;
|
||||
case M:
|
||||
if ( reference.length() < refIdx+ce.getLength() )
|
||||
ok_flag = false;
|
||||
else
|
||||
sb.append(reference.substring(refIdx, refIdx+ce.getLength()));
|
||||
refIdx += ce.getLength();
|
||||
altIdx += ce.getLength();
|
||||
break;
|
||||
case I:
|
||||
sb.append(aRead.getReadString().substring(altIdx, altIdx+ce.getLength()));
|
||||
altIdx += ce.getLength();
|
||||
indelCount++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// make sure that there is at most only a single indel and it aligns appropriately!
|
||||
if ( !ok_flag || indelCount > 1 || reference.length() < refIdx )
|
||||
continue;
|
||||
|
||||
sb.append(reference.substring(refIdx));
|
||||
String altConsensus = sb.toString();
|
||||
|
||||
// for each imperfect match to the reference, score it against this alternative
|
||||
Consensus consensus = new Consensus(altConsensus, c, swConsensus.getAlignmentStart2wrt1());
|
||||
for ( int j = 0; j < altReads.size(); j++ ) {
|
||||
AlignedRead toTest = altReads.get(j);
|
||||
Pair<Integer, Integer> altAlignment = findBestOffset(altConsensus, toTest);
|
||||
|
||||
// the mismatch score is the min of its alignment vs. the reference and vs. the alternate
|
||||
int myScore = altAlignment.getSecond();
|
||||
if ( myScore >= toTest.getMismatchScoreToReference() )
|
||||
myScore = toTest.getMismatchScoreToReference();
|
||||
// keep track of reads that align better to the alternate consensus
|
||||
else
|
||||
consensus.readIndexes.add(new Pair<Integer, Integer>(j, altAlignment.getFirst()));
|
||||
|
||||
logger.debug(aRead.getReadString() + " vs. " + toTest.getReadString() + " => " + myScore + " - " + altAlignment.getFirst());
|
||||
consensus.mismatchSum += myScore;
|
||||
if ( myScore == 0 )
|
||||
// we already know that this is its consensus, so don't bother testing it later
|
||||
altAlignmentsToTest.set(j, false);
|
||||
}
|
||||
logger.debug(aRead.getReadString() + " " + consensus.mismatchSum);
|
||||
if ( bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) {
|
||||
bestConsensus = consensus;
|
||||
logger.debug(aRead.getReadString() + " " + consensus.mismatchSum);
|
||||
}
|
||||
}
|
||||
}
|
||||
System.out.println("Done with MSA");
|
||||
// // if the best alternate consensus has a smaller sum of quality score mismatches (more than the LOD threshold), then clean!
|
||||
// if ( bestConsensus != null && ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0 >= LOD_THRESHOLD ) {
|
||||
// logger.debug("CLEAN: " + bestConsensus.str );
|
||||
// if ( indelOutput != null && bestConsensus.cigar.numCigarElements() > 1 ) {
|
||||
// StringBuffer str = new StringBuffer();
|
||||
// str.append(reads.get(0).getReferenceName());
|
||||
// int position = bestConsensus.positionOnReference + bestConsensus.cigar.getCigarElement(0).getLength();
|
||||
// str.append(":" + (leftmostIndex + position));
|
||||
// CigarElement ce = bestConsensus.cigar.getCigarElement(1);
|
||||
// str.append("\t" + ce.getLength() + ce.getOperator());
|
||||
// str.append("\t" + (((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0) + "\n");
|
||||
// try {
|
||||
// indelOutput.write(str.toString());
|
||||
// indelOutput.flush();
|
||||
// } catch (Exception e) {}
|
||||
// }
|
||||
//
|
||||
// // We need to update the mapping quality score of the cleaned reads;
|
||||
// // however we don't have enough info to use the proper MAQ scoring system.
|
||||
// // For now, we'll use a heuristic:
|
||||
// // the mapping quality score is improved by the LOD difference in mismatching
|
||||
// // bases between the reference and alternate consensus
|
||||
// int improvement = (totalMismatchSum - bestConsensus.mismatchSum) / 10;
|
||||
//
|
||||
// // clean the appropriate reads
|
||||
// for ( Pair<Integer, Integer> indexPair : bestConsensus.readIndexes ) {
|
||||
// AlignedRead aRead = altReads.get(indexPair.getFirst());
|
||||
// updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.getSecond(), aRead, (int)leftmostIndex);
|
||||
// aRead.getRead().setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + improvement, 255));
|
||||
// aRead.getRead().setAttribute("NM", numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // write them out
|
||||
// for ( SAMRecord rec : refReads )
|
||||
// readsToWrite.add(new ComparableSAMRecord(rec));
|
||||
// for ( AlignedRead aRec : altReads )
|
||||
// readsToWrite.add(new ComparableSAMRecord(aRec.getRead()));
|
||||
}
|
||||
|
||||
|
||||
|
||||
// ----------------------------------- PRIVATE IN IntervalCleanerWalker
|
||||
public static final int MAX_QUAL = 99;
|
||||
|
||||
private static class Mismatch {
|
||||
AlignedRead aRead;
|
||||
int offset;
|
||||
long position;
|
||||
char mismatchBase;
|
||||
int qualityScore;
|
||||
|
||||
private Mismatch(AlignedRead aRead, int offset, long position, char mismatchBase, int qualityScore) {
|
||||
this.aRead = aRead;
|
||||
this.offset = offset;
|
||||
this.position = position;
|
||||
this.mismatchBase = mismatchBase;
|
||||
this.qualityScore = qualityScore;
|
||||
}
|
||||
}
|
||||
|
||||
private static int mismatchQualitySum(AlignedRead aRead, String refSeq, int refIndex) {
|
||||
return mismatchQualitySum(aRead, refSeq, refIndex, 0);
|
||||
}
|
||||
|
||||
private static int mismatchQualitySum(AlignedRead aRead, String refSeq, int refIndex, int minMismatchQualityScore) {
|
||||
List<Mismatch> mismatches = getMismatches(aRead, refSeq, refIndex);
|
||||
int sum = 0;
|
||||
for(Mismatch mm : mismatches) {
|
||||
if (mm.qualityScore >= minMismatchQualityScore) {
|
||||
sum += mm.qualityScore;
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Returns a list of position
|
||||
* @param aRead
|
||||
* @param refSeq
|
||||
* @param refIndex
|
||||
* @param minMismatchQualityScore
|
||||
* @return
|
||||
*/
|
||||
private static List<Mismatch> getMismatches(AlignedRead aRead, String refSeq, int refIndex) {
|
||||
List<Mismatch> mismatches = new ArrayList<Mismatch>();
|
||||
|
||||
String readSeq = aRead.getReadString();
|
||||
String quals = aRead.getBaseQualityString();
|
||||
int readIndex = 0;
|
||||
int sum = 0;
|
||||
Cigar c = aRead.getCigar();
|
||||
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
||||
CigarElement ce = c.getCigarElement(i);
|
||||
switch ( ce.getOperator() ) {
|
||||
case M:
|
||||
for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIndex++ ) {
|
||||
// FIXME: what is this case????
|
||||
if ( refIndex >= refSeq.length() ) {
|
||||
sum += MAX_QUAL;
|
||||
} else {
|
||||
char readBase = Character.toUpperCase(readSeq.charAt(readIndex));
|
||||
char refBase = Character.toUpperCase(refSeq.charAt(refIndex));
|
||||
|
||||
if ( readBase != refBase ) {
|
||||
int qual = (int)quals.charAt(readIndex) - 33;
|
||||
mismatches.add(new Mismatch(aRead, readIndex, refIndex, readBase, qual));
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case I:
|
||||
readIndex += ce.getLength();
|
||||
break;
|
||||
case D:
|
||||
refIndex += ce.getLength();
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
return mismatches;
|
||||
}
|
||||
|
||||
private Pair<Integer, Integer> findBestOffset(String ref, AlignedRead read) {
|
||||
int attempts = ref.length() - read.getReadLength() + 1;
|
||||
int bestScore = mismatchQualitySum(read, ref, 0);
|
||||
int bestIndex = 0;
|
||||
for ( int i = 1; i < attempts; i++ ) {
|
||||
// we can't get better than 0!
|
||||
if ( bestScore == 0 )
|
||||
return new Pair<Integer, Integer>(bestIndex, 0);
|
||||
int score = mismatchQualitySum(read, ref, i);
|
||||
if ( score < bestScore ) {
|
||||
bestScore = score;
|
||||
bestIndex = i;
|
||||
}
|
||||
}
|
||||
return new Pair<Integer, Integer>(bestIndex, bestScore);
|
||||
}
|
||||
|
||||
|
||||
private class AlignedRead {
|
||||
SAMRecord read;
|
||||
int mismatchScoreToReference;
|
||||
|
||||
public AlignedRead(SAMRecord read) {
|
||||
this.read = read;
|
||||
mismatchScoreToReference = 0;
|
||||
}
|
||||
|
||||
public SAMRecord getRead() {
|
||||
return read;
|
||||
}
|
||||
|
||||
public String getReadString() {
|
||||
return read.getReadString();
|
||||
}
|
||||
|
||||
public int getReadLength() {
|
||||
return read.getReadLength();
|
||||
}
|
||||
|
||||
public Cigar getCigar() {
|
||||
return read.getCigar();
|
||||
}
|
||||
|
||||
public void setCigar(Cigar cigar) {
|
||||
read.setCigar(cigar);
|
||||
}
|
||||
|
||||
public String getBaseQualityString() {
|
||||
return read.getBaseQualityString();
|
||||
}
|
||||
|
||||
public void setMismatchScoreToReference(int score) {
|
||||
mismatchScoreToReference = score;
|
||||
}
|
||||
|
||||
public int getMismatchScoreToReference() {
|
||||
return mismatchScoreToReference;
|
||||
}
|
||||
}
|
||||
|
||||
private class Consensus {
|
||||
public String str;
|
||||
public int mismatchSum;
|
||||
public int positionOnReference;
|
||||
public Cigar cigar;
|
||||
public ArrayList<Pair<Integer, Integer>> readIndexes;
|
||||
|
||||
public Consensus(String str, Cigar cigar, int positionOnReference) {
|
||||
this.str = str;
|
||||
this.cigar = cigar;
|
||||
this.positionOnReference = positionOnReference;
|
||||
mismatchSum = 0;
|
||||
readIndexes = new ArrayList<Pair<Integer, Integer>>();
|
||||
}
|
||||
}
|
||||
|
||||
public static String cigarToString(Cigar cig) {
|
||||
StringBuilder b = new StringBuilder();
|
||||
|
||||
for ( int i = 0 ; i < cig.numCigarElements() ; i++ ) {
|
||||
char c='?';
|
||||
switch ( cig.getCigarElement(i).getOperator() ) {
|
||||
case M : c = 'M'; break;
|
||||
case D : c = 'D'; break;
|
||||
case I : c = 'I'; break;
|
||||
}
|
||||
b.append(cig.getCigarElement(i).getLength());
|
||||
b.append(c);
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue