basic step0 implementation

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@679 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kcibul 2009-05-12 19:49:39 +00:00
parent 57ecb7fbf1
commit 8079acb1d3
1 changed files with 257 additions and 0 deletions

View File

@ -0,0 +1,257 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
import net.sf.samtools.SAMRecord;
import java.util.List;
import java.util.HashMap;
import java.util.Map;
public class SomaticMutationWalker extends LocusWalker<Integer, Integer> {
protected static class QualitySums {
private int a = 0;
private int c = 0;
private int g = 0;
private int t = 0;
public int get(final char base) {
if (base == 'a' || base == 'A') { return a; }
if (base == 'c' || base == 'C') { return c; }
if (base == 'g' || base == 'G') { return g; }
if (base == 't' || base == 'T') { return t; }
throw new RuntimeException("Unknown base: " + base);
}
public void incrementSum(final char base, final byte qual) {
if (base == 'a' || base == 'A') { a += qual; }
else if (base == 'c' || base == 'C') { c += qual; }
else if (base == 'g' || base == 'G') { g += qual; }
else if (base == 't' || base == 'T') { t += qual; }
else throw new RuntimeException("Unknown base: " + base);
}
public int getOtherQualities(final char base) {
int total = a + c + g + t;
if (base == 'a' || base == 'A') { return total-a; }
else if (base == 'c' || base == 'C') { return total-c; }
else if (base == 'g' || base == 'G') { return total-g; }
else if (base == 't' || base == 'T') { return total-t; }
else throw new RuntimeException("Unknown base: " + base);
}
public void reset() {
a = 0; c = 0; g = 0; t = 0;
}
}
@Argument(fullName = "tumor_sample_name", shortName = "s1", required = true)
public String tumorSampleName;
@Argument(fullName = "normal_sample_name", shortName = "s2", required = true)
public String normalSampleName;
public void initialize() {
}
public String walkerType() { return "ByLocus"; }
// Do we actually want to operate on the context?
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) {
return true; // We are keeping all the reads
}
public static int MAX_INSERT_SIZE = 10000;
public static int MIN_MUTANT_SUM_PRETEST = 60;
public static int MIN_MUTANT_SUM = 100;
public static double TUMOR_LOD = 6.3d;
// public static double TUMOR_LOD = 1.0d;
public static double NORMAL_LOD = 2.3d;
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
Map<Character, Integer> midp = new HashMap<Character, Integer>();
midp.put('A', Integer.MAX_VALUE);
midp.put('C', Integer.MAX_VALUE);
midp.put('G', Integer.MAX_VALUE);
midp.put('T', Integer.MAX_VALUE);
// First, exclude DBSNP Sites
for ( ReferenceOrderedDatum datum : tracker.getAllRods() )
{
if ( datum != null )
{
if ( datum instanceof rodDbSNP) {
// we're at a dbsnp site... move along... nothing to see here...
return -1;
}
}
}
char upRef = Character.toUpperCase(ref);
// TODO: should we be able to call mutations at bases where ref is N?
if (upRef == 'N') { return -1; }
List<SAMRecord> reads = context.getReads();
GenotypeLikelihoods tumorGL = new GenotypeLikelihoods();
GenotypeLikelihoods normalGL = new GenotypeLikelihoods();
QualitySums tumorQualitySums = new QualitySums();
QualitySums normalQualitySums = new QualitySums();
StringBuilder tumorBases = new StringBuilder();
StringBuilder normalBases = new StringBuilder();
for ( int i = 0; i < reads.size(); i++ )
{
SAMRecord read = reads.get(i);
if (read.getNotPrimaryAlignmentFlag() ||
read.getDuplicateReadFlag() ||
read.getReadUnmappedFlag() ||
read.getMappingQuality() <= 0
// || (read.getReadPairedFlag() && (!read.getProperPairFlag() || read.getInferredInsertSize() >= MAX_INSERT_SIZE))
) {
continue;
}
String rg = (String) read.getAttribute("RG");
String sample = read.getHeader().getReadGroup(rg).getSample();
int offset = context.getOffsets().get(i);
char base = read.getReadString().charAt(offset);
byte qual = read.getBaseQualities()[offset];
if (base == 'N' || base == 'n') { continue; }
if (normalSampleName.equals(sample)) {
normalGL.add(ref, base, qual);
normalQualitySums.incrementSum(base, qual);
normalBases.append(base);
} else if (tumorSampleName.equals(sample)) {
tumorGL.add(ref, base, qual);
tumorQualitySums.incrementSum(base, qual);
tumorBases.append(base);
int midDist = Math.abs((int)(read.getReadLength() / 2) - offset);
if (midDist < midp.get(base)) { midp.put(base, midDist); }
} else {
throw new RuntimeException("Unknown Sample Name: " + sample);
}
}
// pretest: if the sum of the quality scores for all non-ref alleles < 60, just quit looking now
if (tumorQualitySums.getOtherQualities(ref) < MIN_MUTANT_SUM_PRETEST) {
return -1;
}
// Test each of the poosible alternate alleles
for (char altAllele : new char[]{'A','C','G','T'}) {
if (altAllele == upRef) { continue; }
double[] tumorRefAlt = extractRefAlt(tumorGL, ref, altAllele);
double tumorLod = Math.log10(tumorRefAlt[1] + tumorRefAlt[2]) - tumorRefAlt[0];
// (i) either an adjusted quality score sum in the tumor for the mutant base must be
// at least 100 or the LOD score for mutant:ref + mutant:mutant vs ref:ref must
// be at least 6.3;
if (tumorQualitySums.get(altAllele) < MIN_MUTANT_SUM && tumorLod < TUMOR_LOD) {
continue;
}
// make sure we've seen at least 1 obs of the alternate allele within 20bp of the read-middle
if (midp.get(altAllele) > 20) {
out.println("Rejecting due to midpoint check!");
return 0;
}
double[] refAlt = extractRefAlt(normalGL, ref, altAllele);
double normalLod = refAlt[0] - Math.log10(refAlt[1] + refAlt[2]);
// (ii) the quality score sum for the mutant base in the normal must be < 50 and the
// LOD score for ref:ref vs mutant:ref + mutant:mutant must be at least 2.3.
if ( normalQualitySums.get(altAllele) > 50 || normalLod < NORMAL_LOD ) {
continue;
}
// if we're still here... we've got a somatic mutation! Output the results
// and stop looking for mutants!
out.println(context.getLocation() + " " + upRef + " " + altAllele +
" TScore:" + tumorLod +
" TRefSum: " + tumorQualitySums.get(ref) +
" TAltSum: " + tumorQualitySums.get(altAllele) +
" NScore:" + normalLod +
" NRefSum: " + normalQualitySums.get(ref) +
" NAltSum: " + normalQualitySums.get(altAllele) + " " +
tumorBases.toString() + " " +
normalBases.toString()
);
return 1;
}
return -1;
}
/**
* Extract the LOD comparing ref:ref to ref:alt and alt:alt
*/
private double[] extractRefAlt(GenotypeLikelihoods gl, char ref, char altAllele) {
double refRef = 0;
double altRef = 0;
double altAlt = 0;
for(int j=0; j<10; j++) {
String gt = gl.genotypes[j];
double likelihood = gl.likelihoods[j];
// the ref:mutant theory
if ( (gt.charAt(0) == ref && gt.charAt(1) == altAllele) ||
(gt.charAt(0) == altAllele && gt.charAt(1) == ref) ) {
altRef += Math.pow(10, likelihood);
}
if ( gt.charAt(0) == altAllele && gt.charAt(1) == altAllele) {
altAlt += Math.pow(10, likelihood);
}
if ( gt.charAt(0) == ref && gt.charAt(1) == ref) {
refRef = likelihood;
}
}
return new double[]{refRef, altRef, altAlt};
}
// Given result of map function
public Integer reduceInit() {
return 0;
}
public Integer reduce(Integer value, Integer sum) {
return value + sum;
}
@Override
public void onTraversalDone(Integer result) {
// out.println(String.format("FINAL - %d %d %d %d", totalSites, tumorCovered, normalCovered, somaticCovered));
}
}