diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java new file mode 100755 index 000000000..a81ba4a71 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java @@ -0,0 +1,119 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.*; + +import java.util.List; +import java.util.ArrayList; + + +public class AlleleBalance implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + + double ratio; + + Genotype g = genotypes.get(0); + if ( g instanceof ReadBacked && g instanceof PosteriorsBacked ) { + Pair weightedBalance = computeWeightedBalance(ref.getBase(), genotypes, pileup); + if ( weightedBalance.second == 0 ) + return null; + ratio = weightedBalance.first; + } else { + // this test doesn't make sense for homs + Genotype genotype = VariantAnnotator.getFirstHetVariant(genotypes); + if ( genotype == null ) + return null; + + final String genotypeStr = genotype.getBases().toUpperCase(); + if ( genotypeStr.length() != 2 ) + return null; + + final String bases = pileup.getBases().toUpperCase(); + if ( bases.length() == 0 ) + return null; + + ratio = computeSingleBalance(ref.getBase(), genotypeStr, bases); + } + + return new Pair("AlleleBalance", String.format("%.2f", ratio)); + } + + private double computeSingleBalance(char ref, final String genotypeStr, final String bases) { + + char a = genotypeStr.charAt(0); + char b = genotypeStr.charAt(1); + int aCount = Utils.countOccurrences(a, bases); + int bCount = Utils.countOccurrences(b, bases); + + int refCount = a == ref ? aCount : bCount; + int altCount = a == ref ? bCount : aCount; + + double ratio = (double)refCount / (double)(refCount + altCount); + return ratio; + } + + // returns the ratio and then number of points which comprise it + private Pair computeWeightedBalance(char ref, List genotypes, ReadBackedPileup pileup) { + + ArrayList refBalances = new ArrayList(); + ArrayList weights = new ArrayList(); + + // accumulate ratios and weights + for ( Genotype g : genotypes ) { + + if ( !(g instanceof ReadBacked) || !(g instanceof PosteriorsBacked) ) + continue; + + final String genotypeStr = g.getBases().toUpperCase(); + if ( genotypeStr.length() != 2 ) + continue; + + DiploidGenotype bestGenotype = DiploidGenotype.unorderedValueOf(genotypeStr); + + // we care only about het ref calls + if ( !bestGenotype.isHetRef(ref) ) + continue; + + char a = genotypeStr.charAt(0); + char b = genotypeStr.charAt(1); + char altBase = a != ref ? a : b; + + // get the base counts at this pileup (minus deletions) + ReadBackedPileup myPileup = ((ReadBacked)g).getPileup(); + + // if the pileup is null, we'll just have to use the full pileup (it's better than nothing) + if ( myPileup == null ) + myPileup = pileup; + + int[] counts = myPileup.getBasePileupAsCounts(); + int refCount = counts[BaseUtils.simpleBaseToBaseIndex(ref)]; + int altCount = counts[BaseUtils.simpleBaseToBaseIndex(altBase)]; + + double[] posteriors = ((PosteriorsBacked)g).getPosteriors(); + posteriors = MathUtils.normalizeFromLog10(posteriors); + weights.add(posteriors[bestGenotype.ordinal()]); + refBalances.add((double)refCount / (double)(refCount + altCount)); + } + + double ratio = 0.0; + + if ( weights.size() > 0 ) { + // normalize the weights + double sum = 0.0; + for (int i = 0; i < weights.size(); i++) + sum += weights.get(i); + for (int i = 0; i < weights.size(); i++) + weights.set(i, weights.get(i) / sum); + + // calculate total weighted ratios + for (int i = 0; i < weights.size(); i++) + ratio += weights.get(i) * refBalances.get(i); + } + + return new Pair(ratio, weights.size()); + } + + public boolean useZeroQualityReads() { return false; } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java new file mode 100755 index 000000000..85e9258ac --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -0,0 +1,19 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.genotype.Genotype; + +import java.util.List; + + +public class DepthOfCoverage implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + int depth = pileup.getReads().size(); + return new Pair("DoC", String.format("%d", depth)); + } + + public boolean useZeroQualityReads() { return false; } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java new file mode 100755 index 000000000..28634d9bb --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -0,0 +1,179 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.genotype.Genotype; +import net.sf.samtools.SAMRecord; +import cern.jet.math.Arithmetic; + +import java.util.List; + + +public class FisherStrand implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + + // this test doesn't make sense for homs + Genotype genotype = VariantAnnotator.getFirstHetVariant(genotypes); + if ( genotype == null ) + return null; + + final String genotypeStr = genotype.getBases().toUpperCase(); + if ( genotypeStr.length() != 2 ) + return null; + + int allele1 = BaseUtils.simpleBaseToBaseIndex(genotypeStr.charAt(0)); + int allele2 = BaseUtils.simpleBaseToBaseIndex(genotypeStr.charAt(1)); + + Double pvalue = strandTest(pileup, allele1, allele2); + if ( pvalue == null ) + return null; + + return new Pair("FisherStrand", String.format("%.4f", pvalue)); + } + + public boolean useZeroQualityReads() { return false; } + + private Double strandTest(ReadBackedPileup pileup, int allele1, int allele2) { + int[][] table = getContingencyTable(pileup, allele1, allele2); + if ( !variantIsHet(table) ) + return null; + + double pCutoff = computePValue(table); + //printTable(table, pCutoff); + + double pValue = pCutoff; + while (rotateTable(table)) { + double pValuePiece = computePValue(table); + + //printTable(table, pValuePiece); + + if (pValuePiece <= pCutoff) { + pValue += pValuePiece; + } + } + + table = getContingencyTable(pileup, allele1, allele2); + + while (unrotateTable(table)) { + double pValuePiece = computePValue(table); + + //printTable(table, pValuePiece); + + if (pValuePiece <= pCutoff) { + pValue += pValuePiece; + } + } + + //System.out.printf("P-cutoff: %f\n", pCutoff); + //System.out.printf("P-value: %f\n\n", pValue); + + return pValue; + } + + private static boolean variantIsHet(int[][] table) { + return ((table[0][1] != 0 || table[0][1] != 0) && (table[1][0] != 0 || table[1][1] != 0)); + } + + private static void printTable(int[][] table, double pValue) { + System.out.printf("%d %d; %d %d : %f\n", table[0][0], table[0][1], table[1][0], table[1][1], pValue); + } + + private static boolean rotateTable(int[][] table) { + table[0][0] -= 1; + table[1][0] += 1; + + table[0][1] += 1; + table[1][1] -= 1; + + return (table[0][0] >= 0 && table[1][1] >= 0) ? true : false; + } + + private static boolean unrotateTable(int[][] table) { + table[0][0] += 1; + table[1][0] -= 1; + + table[0][1] -= 1; + table[1][1] += 1; + + return (table[0][1] >= 0 && table[1][0] >= 0) ? true : false; + } + + private double computePValue(int[][] table) { + + int[] rowSums = { sumRow(table, 0), sumRow(table, 1) }; + int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) }; + int N = rowSums[0] + rowSums[1]; + + // calculate in log space so we don't die with high numbers + double pCutoff = Arithmetic.logFactorial(rowSums[0]) + + Arithmetic.logFactorial(rowSums[1]) + + Arithmetic.logFactorial(colSums[0]) + + Arithmetic.logFactorial(colSums[1]) + - Arithmetic.logFactorial(table[0][0]) + - Arithmetic.logFactorial(table[0][1]) + - Arithmetic.logFactorial(table[1][0]) + - Arithmetic.logFactorial(table[1][1]) + - Arithmetic.logFactorial(N); + return Math.exp(pCutoff); + } + + private static int sumRow(int[][] table, int column) { + int sum = 0; + for (int r = 0; r < table.length; r++) { + sum += table[r][column]; + } + + return sum; + } + + private static int sumColumn(int[][] table, int row) { + int sum = 0; + for (int c = 0; c < table[row].length; c++) { + sum += table[row][c]; + } + + return sum; + } + + /** + Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: + * fw rc + * allele1 # # + * allele2 # # + * @param pileup the pileup for the locus + * @param allele1 information for the called variant + * @param allele2 information for the called variant + * @return a 2x2 contingency table + */ + private static int[][] getContingencyTable(ReadBackedPileup pileup, int allele1, int allele2) { + + int[][] table = new int[2][2]; + + List reads = pileup.getReads(); + List offsets = pileup.getOffsets(); + + for (int readIndex = 0; readIndex < reads.size(); readIndex++) { + SAMRecord read = reads.get(readIndex); + int offset = offsets.get(readIndex); + + // skip over deletion sites + if ( offset == -1 ) + continue; + + int readAllele = BaseUtils.simpleBaseToBaseIndex(read.getReadString().charAt(offset)); + boolean isFW = !read.getReadNegativeStrandFlag(); + + if (readAllele == allele1 || readAllele == allele2) { + int row = (readAllele == allele1) ? 0 : 1; + int column = isFW ? 0 : 1; + + table[row][column]++; + } + } + + return table; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java new file mode 100755 index 000000000..1a1277e53 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java @@ -0,0 +1,59 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.genotype.Genotype; + +import java.util.List; + + +public class HomopolymerRun implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + + Genotype genotype = VariantAnnotator.getFirstVariant(ref.getBase(), genotypes); + if ( genotype == null ) + return null; + + final String genotypeStr = genotype.getBases().toUpperCase(); + if ( genotypeStr.length() != 2 ) + return null; + + char altAllele = (genotypeStr.charAt(0) != ref.getBase() ? genotypeStr.charAt(0) : genotypeStr.charAt(1)); + + int run = computeHomopolymerRun(altAllele, ref); + return new Pair("HomopolymerRun", String.format("%d", run)); + } + + public boolean useZeroQualityReads() { return false; } + + private static int computeHomopolymerRun(char altAllele, ReferenceContext ref) { + + // TODO -- this needs to be computed in a more accurate manner + // We currently look only at direct runs of the alternate allele adjacent to this position + + char[] bases = ref.getBases(); + GenomeLoc window = ref.getWindow(); + GenomeLoc locus = ref.getLocus(); + + int refBasePos = (int)(locus.getStart() - window.getStart()); + + int leftRun = 0; + for ( int i = refBasePos - 1; i >= 0; i--) { + if ( bases[i] != altAllele ) + break; + leftRun++; + } + + int rightRun = 0; + for ( int i = refBasePos + 1; i < bases.length; i++) { + if ( bases[i] != altAllele ) + break; + rightRun++; + } + + return Math.max(leftRun, rightRun); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java new file mode 100755 index 000000000..9a83d648b --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java @@ -0,0 +1,25 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.genotype.Genotype; +import net.sf.samtools.SAMRecord; + +import java.util.List; + + +public class MappingQualityZero implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + List reads = pileup.getReads(); + int MQ0Count = 0; + for (int i=0; i < reads.size(); i++) { + if ( reads.get(i).getMappingQuality() == 0 ) + MQ0Count++; + } + return new Pair("MAPQ0", String.format("%d", MQ0Count)); + } + + public boolean useZeroQualityReads() { return true; } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/OnOffGenotype.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/OnOffGenotype.java new file mode 100755 index 000000000..142c50207 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/OnOffGenotype.java @@ -0,0 +1,123 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.Genotype; +import org.broadinstitute.sting.utils.genotype.ReadBacked; +import org.broadinstitute.sting.utils.genotype.PosteriorsBacked; +import org.broadinstitute.sting.utils.genotype.DiploidGenotype; + +import java.util.List; +import java.util.ArrayList; + +public class OnOffGenotype implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + + double ratio; + + Genotype g = genotypes.get(0); + if ( g instanceof ReadBacked && g instanceof PosteriorsBacked) { + Pair weightedBalance = computeWeightedBalance(ref.getBase(), genotypes, pileup); + if ( weightedBalance.second == 0 ) + return null; + ratio = weightedBalance.first; + } else { + Genotype genotype = VariantAnnotator.getFirstVariant(ref.getBase(), genotypes); + if ( genotype == null ) + return null; + + final String genotypeStr = genotype.getBases().toUpperCase(); + if ( genotypeStr.length() != 2 ) + return null; + + final String bases = pileup.getBases().toUpperCase(); + if ( bases.length() == 0 ) + return null; + + ratio = computeSingleBalance(genotypeStr, bases); + } + + return new Pair("OnOffGenotype", String.format("%.2f", ratio)); + } + + private double computeSingleBalance(final String genotypeStr, final String bases) { + + int on = 0, off = 0; + for ( char base : BaseUtils.BASES ) { + int count = BasicPileup.countBase(base, bases); + if ( Utils.countOccurrences(base, genotypeStr) > 0 ) + on += count; + else + off += count; + } + + double ratio = (double)on / (double)(on + off); + return ratio; + } + + private Pair computeWeightedBalance(char ref, List genotypes, ReadBackedPileup pileup) { + + ArrayList onOffBalances = new ArrayList(); + ArrayList weights = new ArrayList(); + + // accumulate ratios and weights + for ( Genotype g : genotypes ) { + + if ( !(g instanceof ReadBacked) || !(g instanceof PosteriorsBacked) ) + continue; + + final String genotypeStr = g.getBases().toUpperCase(); + if ( genotypeStr.length() != 2 ) + continue; + + DiploidGenotype bestGenotype = DiploidGenotype.unorderedValueOf(genotypeStr); + + // we care only about non-ref calls + if ( bestGenotype.isHomRef(ref) ) + continue; + + char a = genotypeStr.charAt(0); + char b = genotypeStr.charAt(1); + + // get the base counts at this pileup (minus deletions) + ReadBackedPileup myPileup = ((ReadBacked)g).getPileup(); + + // if the pileup is null, we'll just have to use the full pileup (it's better than nothing) + if ( myPileup == null ) + myPileup = pileup; + + int[] counts = myPileup.getBasePileupAsCounts(); + int onCount = counts[BaseUtils.simpleBaseToBaseIndex(a)]; + if ( a != b ) + onCount += counts[BaseUtils.simpleBaseToBaseIndex(b)]; + int totalCount = 0; + for (int i = 0; i < counts.length; i++) + totalCount += counts[i]; + + double[] posteriors = ((PosteriorsBacked)g).getPosteriors(); + posteriors = MathUtils.normalizeFromLog10(posteriors); + weights.add(posteriors[bestGenotype.ordinal()]); + onOffBalances.add((double)onCount / (double)totalCount); + } + + double ratio = 0.0; + + if ( weights.size() > 0 ) { + // normalize the weights + double sum = 0.0; + for (int i = 0; i < weights.size(); i++) + sum += weights.get(i); + for (int i = 0; i < weights.size(); i++) + weights.set(i, weights.get(i) / sum); + + // calculate total weighted ratios + for (int i = 0; i < weights.size(); i++) + ratio += weights.get(i) * onOffBalances.get(i); + } + + return new Pair(ratio, weights.size()); + } + + public boolean useZeroQualityReads() { return false; } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java new file mode 100755 index 000000000..878f739f1 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -0,0 +1,25 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.genotype.Genotype; +import net.sf.samtools.SAMRecord; + +import java.util.List; + + +public class RMSMappingQuality implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + List reads = pileup.getReads(); + int[] qualities = new int[reads.size()]; + for (int i=0; i < reads.size(); i++) + qualities[i] = reads.get(i).getMappingQuality(); + double rms = MathUtils.rms(qualities); + return new Pair("RMSMAPQ", String.format("%.2f", rms)); + } + + public boolean useZeroQualityReads() { return true; } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java new file mode 100755 index 000000000..58f5edc46 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -0,0 +1,23 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.genotype.Genotype; + +import java.util.List; + + +public class SpanningDeletions implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + int deletions = 0; + for (Integer offset : pileup.getOffsets() ) { + if ( offset == -1 ) + deletions++; + } + return new Pair("SpanningDeletions", String.format("%d", deletions)); + } + + public boolean useZeroQualityReads() { return false; } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotation.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotation.java new file mode 100755 index 000000000..8146c4a82 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotation.java @@ -0,0 +1,15 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.genotype.Genotype; + +import java.util.List; + +public interface VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes); + public boolean useZeroQualityReads(); + +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java new file mode 100755 index 000000000..bf0ee18b0 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -0,0 +1,265 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.*; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.vcf.*; +import org.broadinstitute.sting.utils.genotype.*; +import org.broadinstitute.sting.utils.genotype.Genotype; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.playground.gatk.walkers.variantstovcf.VariantsToVCF; + +import java.util.*; +import java.io.*; + +import net.sf.samtools.SAMRecord; + + +/** + * VariantAnnotator annotates variants. + */ +//@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariationRod.class)) +@Allows(value={DataSource.READS, DataSource.REFERENCE}) +@Reference(window=@Window(start=-20,stop=20)) +public class VariantAnnotator extends RodWalker { + @Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true) + protected File VCF_OUT; + @Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false) + protected String sampleName = null; + @Argument(fullName="annotations", shortName="A", doc="Annotation types to apply to variant calls", required=false) + protected String[] ANNOTATIONS; + @Argument(fullName="useAllAnnotations", shortName="all", doc="Use all possible annotations", required=false) + protected Boolean USE_ALL_ANNOTATIONS = false; + @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit") + protected Boolean LIST = false; + + private VCFWriter vcfWriter; + private VCFHeader vcfHeader; + + private HashMap nonVCFsampleName = new HashMap(); + + private ArrayList requestedAnnotations; + + // mapping from class name to class + private static HashMap allAnnotations = null; + + + private static void determineAllAnnotations() { + allAnnotations = new HashMap(); + List> annotationClasses = PackageUtils.getClassesImplementingInterface(VariantAnnotation.class); + for ( Class c : annotationClasses ) { + try { + VariantAnnotation annot = (VariantAnnotation) c.newInstance(); + allAnnotations.put(c.getSimpleName().toUpperCase(), annot); + } catch (InstantiationException e) { + throw new StingException(String.format("Cannot instantiate annotation class '%s': must be concrete class", c.getSimpleName())); + } catch (IllegalAccessException e) { + throw new StingException(String.format("Cannot instantiate annotation class '%s': must have no-arg constructor", c.getSimpleName())); + } + } + } + + private void listFiltersAndExit() { + List> annotationClasses = PackageUtils.getClassesImplementingInterface(VariantAnnotation.class); + out.println("\nAvailable annotations:"); + for (int i = 0; i < annotationClasses.size(); i++) + out.println("\t" + annotationClasses.get(i).getSimpleName()); + out.println(); + System.exit(0); + } + + /** + * Prepare the output file and the list of available features. + */ + public void initialize() { + + if ( LIST ) + listFiltersAndExit(); + + // get the list of all sample names from the various VCF input rods + HashSet samples = new HashSet(); + VCFUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap, String>()); + + // add the non-VCF sample from the command-line, if applicable + if ( sampleName != null ) { + nonVCFsampleName.put(sampleName.toUpperCase(), "variant"); + samples.add(sampleName.toUpperCase()); + } + + // if there are no valid samples, die + if ( samples.size() == 0 ) + throw new StingException("There are no samples input at all; use the --sampleName argument to specify one"); + + determineAllAnnotations(); + + if ( USE_ALL_ANNOTATIONS ) { + requestedAnnotations = new ArrayList(allAnnotations.values()); + } else { + requestedAnnotations = new ArrayList(); + if ( ANNOTATIONS != null ) { + for ( String requested : ANNOTATIONS ) { + + VariantAnnotation annot = allAnnotations.get(requested.toUpperCase()); + if ( annot == null ) + throw new StingException("Unknown annotation '" + requested + "'. Issue the '-ls' argument to list available annotations."); + + requestedAnnotations.add(annot); + } + } + } + + // setup the header fields + Map hInfo = new HashMap(); + hInfo.put("format", VCFWriter.VERSION); + hInfo.put("source", "VariantAnnotator"); + hInfo.put("reference", this.getToolkit().getArguments().referenceFile.getName()); + + vcfHeader = new VCFHeader(hInfo, samples); + vcfWriter = new VCFWriter(vcfHeader, VCF_OUT); + } + + /** + * Initialize the number of loci processed to zero. + * + * @return 0 + */ + public Integer reduceInit() { return 0; } + + + /** + * We want reads that span deletions + * + * @return true + */ + public boolean includeReadsWithDeletionAtLoci() { return true; } + + /** + * For each site of interest, annotate based on the requested annotation types + * + * @param tracker the meta-data tracker + * @param ref the reference base + * @param context the context for the given locus + * @return 1 if the locus was successfully processed, 0 if otherwise + */ + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + RODRecordList rods = tracker.getTrackData("variant", null); + // ignore places where we don't have a variant + if ( rods == null || rods.getRecords().size() == 0 ) + return 0; + + Map annotations = new HashMap(); + VariationRod variant = (VariationRod)rods.getRecords().get(0); + + // if the reference base is not ambiguous, the variant is a SNP, and it's the appropriate type, we can annotate + if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 && variant.isSNP() && variant instanceof VariantBackedByGenotype ) { + final List genotypes = ((VariantBackedByGenotype)variant).getGenotypes(); + if ( genotypes != null ) + annotations = getAnnotations(ref, context, genotypes, requestedAnnotations); + } + + writeVCF(tracker, ref, context, variant, annotations); + + return 1; + } + + public static Map getAnnotations(ReferenceContext ref, AlignmentContext context, List genotypes) { + if ( allAnnotations == null ) + determineAllAnnotations(); + return getAnnotations(ref, context, genotypes, allAnnotations.values()); + } + + public static Map getAnnotations(ReferenceContext ref, AlignmentContext context, List genotypes, Collection annotations) { + + // set up the pileup for the full collection of reads at this position + ReadBackedPileup fullPileup = new ReadBackedPileup(ref.getBase(), context); + + // also, set up the pileup for the mapping-quality-zero-free context + List reads = context.getReads(); + List offsets = context.getOffsets(); + Iterator readIter = reads.iterator(); + Iterator offsetIter = offsets.iterator(); + ArrayList MQ0freeReads = new ArrayList(); + ArrayList MQ0freeOffsets = new ArrayList(); + while ( readIter.hasNext() ) { + SAMRecord read = readIter.next(); + Integer offset = offsetIter.next(); + if ( read.getMappingQuality() > 0 ) { + MQ0freeReads.add(read); + MQ0freeOffsets.add(offset); + } + } + ReadBackedPileup MQ0freePileup = new ReadBackedPileup(context.getLocation(), ref.getBase(), MQ0freeReads, MQ0freeOffsets); + + HashMap results = new HashMap(); + + for ( VariantAnnotation annotator : annotations) { + Pair annot = annotator.annotate(ref, (annotator.useZeroQualityReads() ? fullPileup : MQ0freePileup), genotypes); + if ( annot != null ) + results.put(annot.first, annot.second); + } + + return results; + } + + + private void writeVCF(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariationRod variant, Map annotations) { + VCFRecord rec = getVCFRecord(tracker, ref, context, variant); + if ( rec != null ) { + rec.addInfoFields(annotations); + vcfWriter.addRecord(rec); + } + } + + + private VCFRecord getVCFRecord(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariationRod variant) { + if ( variant instanceof RodVCF ) { + return ((RodVCF)variant).mCurrentRecord; + } else { + List gt = new ArrayList(); + Map map = new HashMap(); + return VariantsToVCF.generateVCFRecord(tracker, ref, context, vcfHeader, gt, map, nonVCFsampleName, out, false, false); + } + } + + /** + * Increment the number of loci processed. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return the new number of loci processed. + */ + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + /** + * Tell the user the number of loci processed and close out the new variants file. + * + * @param result the number of loci seen. + */ + public void onTraversalDone(Integer result) { + out.printf("Processed %d loci.\n", result); + + vcfWriter.close(); + } + + public static Genotype getFirstVariant(char ref, List genotypes) { + for ( Genotype g : genotypes ) { + if ( g.isVariant(ref) ) + return g; + } + return null; + } + + public static Genotype getFirstHetVariant(List genotypes) { + for ( Genotype g : genotypes ) { + if ( g.isHet() ) + return g; + } + return null; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java index 3d5041eb9..f0af463dc 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java @@ -33,7 +33,7 @@ public class CallsetConcordanceWalker extends RodWalker { private VCFWriter vcfWriter; // a map of rod name to uniquified sample name - HashMap, String> rodNamesToSampleNames = new HashMap, String>(); + private HashMap, String> rodNamesToSampleNames = new HashMap, String>(); /** @@ -53,7 +53,7 @@ public class CallsetConcordanceWalker extends RodWalker { System.exit(0); } - // get the list of all sample names from the various input rods (the need to be uniquified in case there's overlap) + // get the list of all sample names from the various input rods (they need to be uniquified in case there's overlap) HashSet samples = new HashSet(); VCFUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java index 9443e476e..8a50d2a94 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java @@ -97,7 +97,8 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode Genotype call = GenotypeWriterFactory.createSupportedCall(OUTPUT_FORMAT, ref, context.getLocation()); if ( call instanceof ReadBacked ) { - ((ReadBacked)call).setReads(context.getReads()); + ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts.get(sample).getContext(StratifiedContext.OVERALL)); + ((ReadBacked)call).setPileup(pileup); } if ( call instanceof SampleBacked ) { ((SampleBacked)call).setSampleName(sample); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java index 05c67aab9..140e49b10 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java @@ -166,7 +166,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(altAllele); double[] allelePosteriors = new double[] { refPosterior, posteriors[hetGenotype.ordinal()], posteriors[homGenotype.ordinal()] }; - normalizeFromLog10(allelePosteriors); + allelePosteriors = MathUtils.normalizeFromLog10(allelePosteriors); //logger.debug("Normalized posteriors for " + altAllele + ": " + allelePosteriors[0] + " " + allelePosteriors[1] + " " + allelePosteriors[2]); // calculate the posterior weighted frequencies @@ -191,7 +191,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo // multiply by null allele frequency priors to get AF posteriors, then normalize for (int i = 0; i < frequencyEstimationPoints; i++) alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i]; - normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]); + alleleFrequencyPosteriors[baseIndex] = MathUtils.normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]); // calculate p(f>0) double sum = 0.0; @@ -231,34 +231,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo return HWvalues; } - private static void normalizeFromLog10(double[] array) { - // for precision purposes, we need to add (or really subtract, since they're - // all negative) the largest value; also, we need to convert to normal-space. - double maxValue = findMaxEntry(array).first; - for (int i = 0; i < array.length; i++) - array[i] = Math.pow(10, array[i] - maxValue); - - // normalize - double sum = 0.0; - for (int i = 0; i < array.length; i++) - sum += array[i]; - for (int i = 0; i < array.length; i++) - array[i] /= sum; - } - - // returns the maximum value in the array and its index - private static Pair findMaxEntry(double[] array) { - int index = 0; - double max = array[0]; - for (int i = 1; i < array.length; i++) { - if ( array[i] > max ) { - max = array[i]; - index = i; - } - } - return new Pair(max, index); - } - private void printAlleleFrequencyData(char ref, GenomeLoc loc) { verboseWriter.println("Location=" + loc + ", ref=" + ref); @@ -312,7 +284,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo } double phredScaledConfidence = -10.0 * Math.log10(alleleFrequencyPosteriors[indexOfMax][0]); - int bestAFguess = findMaxEntry(alleleFrequencyPosteriors[indexOfMax]).second; + int bestAFguess = Utils.findIndexOfMaxEntry(alleleFrequencyPosteriors[indexOfMax]); // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero if ( !ALL_BASE_MODE && (bestAFguess == 0 || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) @@ -328,7 +300,8 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo Genotype call = GenotypeWriterFactory.createSupportedCall(OUTPUT_FORMAT, ref, context.getLocation()); if ( call instanceof ReadBacked ) { - ((ReadBacked)call).setReads(context.getReads()); + ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts.get(sample).getContext(StratifiedContext.OVERALL)); + ((ReadBacked)call).setPileup(pileup); } if ( call instanceof SampleBacked ) { ((SampleBacked)call).setSampleName(sample); @@ -360,63 +333,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo if ( dbsnp != null ) ((IDBacked)locusdata).setID(dbsnp.getRS_ID()); } - if ( locusdata instanceof ArbitraryFieldsBacked) { - ArrayList refBalances = new ArrayList(); - ArrayList onOffBalances = new ArrayList(); - ArrayList weights = new ArrayList(); - - // accumulate ratios and weights - for ( java.util.Map.Entry entry : GLs.entrySet() ) { - // determine the best genotype - Integer sorted[] = Utils.SortPermutation(entry.getValue().getPosteriors()); - DiploidGenotype bestGenotype = DiploidGenotype.values()[sorted[DiploidGenotype.values().length - 1]]; - - // we care only about het calls - if ( bestGenotype.isHetRef(ref) ) { - - // make sure the alt base is our target alt - char altBase = bestGenotype.base1 != ref ? bestGenotype.base1 : bestGenotype.base2; - if ( altBase != baseOfMax ) - continue; - - // get the base counts at this pileup (minus deletions) - ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts.get(entry.getKey()).getContext(StratifiedContext.OVERALL)); - int[] counts = pileup.getBasePileupAsCounts(); - int refCount = counts[BaseUtils.simpleBaseToBaseIndex(ref)]; - int altCount = counts[BaseUtils.simpleBaseToBaseIndex(altBase)]; - int totalCount = 0; - for (int i = 0; i < counts.length; i++) - totalCount += counts[i]; - - // add the entries - weights.add(entry.getValue().getNormalizedPosteriors()[bestGenotype.ordinal()]); - refBalances.add((double)refCount / (double)(refCount + altCount)); - onOffBalances.add((double)(refCount + altCount) / (double)totalCount); - } - } - - if ( weights.size() > 0 ) { - // normalize the weights - double sum = 0.0; - for (int i = 0; i < weights.size(); i++) - sum += weights.get(i); - for (int i = 0; i < weights.size(); i++) - weights.set(i, weights.get(i) / sum); - - // calculate total weighted ratios - double normalizedRefRatio = 0.0; - double normalizedOnOffRatio = 0.0; - for (int i = 0; i < weights.size(); i++) { - normalizedRefRatio += weights.get(i) * refBalances.get(i); - normalizedOnOffRatio += weights.get(i) * onOffBalances.get(i); - } - - HashMap fields = new HashMap(); - fields.put("AB", String.format("%.2f", normalizedRefRatio)); - fields.put("OO", String.format("%.2f", normalizedOnOffRatio)); - ((ArbitraryFieldsBacked)locusdata).setFields(fields); - } - } if ( locusdata instanceof SLODBacked ) { // the overall lod double overallLog10PofNull = Math.log10(alleleFrequencyPosteriors[indexOfMax][0]); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java index de125120a..5ff51a17f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java @@ -74,7 +74,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation Genotype call = GenotypeWriterFactory.createSupportedCall(OUTPUT_FORMAT, ref, context.getLocation()); if ( call instanceof ReadBacked ) { - ((ReadBacked)call).setReads(discoveryGL.first.getReads()); + ((ReadBacked)call).setPileup(discoveryGL.first); } if ( call instanceof SampleBacked ) { ((SampleBacked)call).setSampleName(sample); @@ -96,29 +96,6 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation if ( dbsnp != null ) ((IDBacked)locusdata).setID(dbsnp.getRS_ID()); } - if ( locusdata instanceof ArbitraryFieldsBacked) { - - DiploidGenotype bestGenotype = DiploidGenotype.values()[bestIndex]; - // we care only about het calls - if ( bestGenotype.isHetRef(ref) ) { - - char altBase = bestGenotype.base1 != ref ? bestGenotype.base1 : bestGenotype.base2; - - // get the base counts at this pileup (minus deletions) - int[] counts = discoveryGL.first.getBasePileupAsCounts(); - int refCount = counts[BaseUtils.simpleBaseToBaseIndex(ref)]; - int altCount = counts[BaseUtils.simpleBaseToBaseIndex(altBase)]; - int totalCount = 0; - for (int i = 0; i < counts.length; i++) - totalCount += counts[i]; - - // set the ratios - HashMap fields = new HashMap(); - fields.put("AB", String.format("%.2f", (double)refCount / (double)(refCount + altCount))); - fields.put("OO", String.format("%.2f",(double)(refCount + altCount) / (double)totalCount)); - ((ArbitraryFieldsBacked)locusdata).setFields(fields); - } - } } return new Pair, GenotypeLocusData>(Arrays.asList(call), locusdata); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 73182923d..f79a14df2 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -30,27 +30,23 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.ReadFilters; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotator; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection; -import org.broadinstitute.sting.utils.genotype.GenotypeWriter; -import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; -import org.broadinstitute.sting.utils.genotype.Genotype; -import org.broadinstitute.sting.utils.genotype.GenotypeLocusData; +import org.broadinstitute.sting.utils.genotype.*; import java.io.File; -import java.util.HashSet; -import java.util.List; -import java.util.ArrayList; +import java.util.*; -@ReadFilters({ZeroMappingQualityReadFilter.class}) +@Reference(window=@Window(start=-20,stop=20)) public class UnifiedGenotyper extends LocusWalker, GenotypeLocusData>, Integer> { @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); @@ -72,8 +68,6 @@ public class UnifiedGenotyper extends LocusWalker, GenotypeL // keep track of some metrics about our calls private CallMetrics callsMetrics; - // are we being called by another walker (which gets around the traversal-level filtering)? - private boolean calledByAnotherWalker = true; /** Enable deletions in the pileup **/ public boolean includeReadsWithDeletionAtLoci() { return true; } @@ -134,7 +128,6 @@ public class UnifiedGenotyper extends LocusWalker, GenotypeL return; // if we got here, then we were instantiated by the GATK engine - calledByAnotherWalker = false; // create the output writer stream if ( VARIANTS_FILE != null ) @@ -143,7 +136,8 @@ public class UnifiedGenotyper extends LocusWalker, GenotypeL this.getToolkit().getArguments().referenceFile.getName(), samples); else - writer = GenotypeWriterFactory.create(UAC.VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out, "UnifiedGenotyper", + writer = GenotypeWriterFactory.create(UAC.VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out, + "UnifiedGenotyper", this.getToolkit().getArguments().referenceFile.getName(), samples); callsMetrics = new CallMetrics(); @@ -154,28 +148,33 @@ public class UnifiedGenotyper extends LocusWalker, GenotypeL * * @param tracker the meta data tracker * @param refContext the reference base - * @param context contextual information around the locus + * @param fullContext contextual information around the locus */ - public Pair, GenotypeLocusData> map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext context) { + public Pair, GenotypeLocusData> map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext fullContext) { char ref = Character.toUpperCase(refContext.getBase()); if ( !BaseUtils.isRegularBase(ref) ) return null; - // because other walkers externally call this map method with their own contexts, - // we need to make sure that the reads are appropriately filtered - if ( calledByAnotherWalker ) - context = filterAlignmentContext(context); + // remove mapping quality zero reads + AlignmentContext MQ0freeContext = filterAlignmentContext(fullContext); // an optimization to speed things up when there is no coverage or when overly covered - if ( context.getReads().size() == 0 || - (UAC.MAX_READS_IN_PILEUP > 0 && context.getReads().size() > UAC.MAX_READS_IN_PILEUP) ) + if ( MQ0freeContext.getReads().size() == 0 || + (UAC.MAX_READS_IN_PILEUP > 0 && MQ0freeContext.getReads().size() > UAC.MAX_READS_IN_PILEUP) ) return null; DiploidGenotypePriors priors = new DiploidGenotypePriors(ref, UAC.heterozygosity, DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE); - return gcm.calculateGenotype(tracker, ref, context, priors); + Pair, GenotypeLocusData> call = gcm.calculateGenotype(tracker, ref, MQ0freeContext, priors); + + // annotate the call, if possible + if ( call != null && call.second != null && call.second instanceof ArbitraryFieldsBacked) { + Map annotations = VariantAnnotator.getAnnotations(refContext, fullContext, call.first); + ((ArbitraryFieldsBacked)call.second).setFields(annotations); + } + + return call; } - // filter the given alignment context private AlignmentContext filterAlignmentContext(AlignmentContext context) { List reads = context.getReads(); List offsets = context.getOffsets(); @@ -185,7 +184,7 @@ public class UnifiedGenotyper extends LocusWalker, GenotypeL for (int i = 0; i < reads.size(); i++) { SAMRecord read = reads.get(i); - if ( read.getMappingQuality() != 0 && read.getReadGroup() != null ) { + if ( read.getMappingQuality() != 0 ) { newReads.add(read); newOffsets.add(offsets.get(i)); } diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index 2aa2c44c0..ccde99503 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -262,5 +262,26 @@ public class MathUtils { return Math.sqrt(rms); } + /** + * normalizes the log10-based array + * @param array the array to be normalized + */ + public static double[] normalizeFromLog10(double[] array) { + double[] normalized = new double[array.length]; + // for precision purposes, we need to add (or really subtract, since they're + // all negative) the largest value; also, we need to convert to normal-space. + double maxValue = Utils.findMaxEntry(array); + for (int i = 0; i < array.length; i++) + normalized[i] = Math.pow(10, array[i] - maxValue); + + // normalize + double sum = 0.0; + for (int i = 0; i < array.length; i++) + sum += normalized[i]; + for (int i = 0; i < array.length; i++) + normalized[i] /= sum; + + return normalized; + } } diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/java/src/org/broadinstitute/sting/utils/Utils.java index efbbed182..c43b57928 100755 --- a/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/java/src/org/broadinstitute/sting/utils/Utils.java @@ -486,6 +486,31 @@ public class Utils { } + // returns the maximum value in the array + public static double findMaxEntry(double[] array) { + return findIndexAndMaxEntry(array).first; + } + + // returns the index of the maximum value in the array + public static int findIndexOfMaxEntry(double[] array) { + return findIndexAndMaxEntry(array).second; + } + + // returns the the maximum value and its index in the array + private static Pair findIndexAndMaxEntry(double[] array) { + if ( array.length == 0 ) + return new Pair(0.0, -1); + int index = 0; + double max = array[0]; + for (int i = 1; i < array.length; i++) { + if ( array[i] > max ) { + max = array[i]; + index = i; + } + } + return new Pair(max, index); + } + /** Returns indices of all occurrences of the specified symbol in the string */ public static int[] indexOfAll(String s, int ch) { int[] pos = new int[64]; diff --git a/java/src/org/broadinstitute/sting/utils/genotype/ReadBacked.java b/java/src/org/broadinstitute/sting/utils/genotype/ReadBacked.java index 2cc95f45f..f515bdf95 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/ReadBacked.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/ReadBacked.java @@ -1,11 +1,9 @@ package org.broadinstitute.sting.utils.genotype; -import net.sf.samtools.SAMRecord; - -import java.util.List; +import org.broadinstitute.sting.utils.ReadBackedPileup; /** - * @author aaron + * @author ebanks * * Interface ReadBacked * @@ -13,16 +11,16 @@ import java.util.List; */ public interface ReadBacked { /** - * get the SAM records that back this genotype call - * @return a list of SAM records + * get the pileup that backs this genotype call + * @return a pileup */ - public List getReads(); + public ReadBackedPileup getPileup(); /** * - * @param reads the reads for this genotype + * @param pileup the pileup for this genotype */ - public void setReads(List reads); + public void setPileup(ReadBackedPileup pileup); /** * get the count of reads diff --git a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java index 23b0c2002..02fc68f5d 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java @@ -109,11 +109,13 @@ public class GeliAdapter implements GenotypeWriter { GeliGenotypeCall gCall = (GeliGenotypeCall)call; char ref = gCall.getReference(); - List recs = gCall.getReads(); - int readCount = recs.size(); + int readCount = gCall.getReadCount(); double maxMappingQual = 0; - for (SAMRecord rec : recs) { - if (maxMappingQual < rec.getMappingQuality()) maxMappingQual = rec.getMappingQuality(); + if ( gCall.getPileup() != null ) { + List recs = gCall.getPileup().getReads(); + for (SAMRecord rec : recs) { + if (maxMappingQual < rec.getMappingQuality()) maxMappingQual = rec.getMappingQuality(); + } } double[] posteriors = gCall.getPosteriors(); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliGenotypeCall.java b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliGenotypeCall.java index b23cf7901..e7966d258 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliGenotypeCall.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliGenotypeCall.java @@ -1,11 +1,8 @@ package org.broadinstitute.sting.utils.genotype.geli; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.*; -import java.util.List; -import java.util.ArrayList; import java.util.Arrays; @@ -20,7 +17,7 @@ public class GeliGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked private final char mRefBase; private final GenomeLoc mLocation; - private List mReads; + private ReadBackedPileup mPileup = null; private double[] mPosteriors; @@ -32,6 +29,8 @@ public class GeliGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked /** * Generate a single sample genotype object * + * @param ref the ref character + * @param loc the genome loc */ public GeliGenotypeCall(char ref, GenomeLoc loc) { mRefBase = ref; @@ -40,7 +39,6 @@ public class GeliGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked // fill in empty data mPosteriors = new double[10]; Arrays.fill(mPosteriors, Double.MIN_VALUE); - mReads = new ArrayList(); } public GeliGenotypeCall(char ref, GenomeLoc loc, String genotype, double negLog10PError) { @@ -66,12 +64,10 @@ public class GeliGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked else mNextGenotype = mRefGenotype; mPosteriors[mNextGenotype.ordinal()] = -1.0 * negLog10PError; - - mReads = new ArrayList(); } - public void setReads(List reads) { - mReads = new ArrayList(reads); + public void setPileup(ReadBackedPileup pileup) { + mPileup = pileup; } public void setPosteriors(double[] posteriors) { @@ -98,7 +94,7 @@ public class GeliGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked public String toString() { lazyEval(); return String.format("%s best=%s cmp=%s ref=%s depth=%d negLog10PError=%.2f", - getLocation(), mBestGenotype, mRefGenotype, mRefBase, mReads.size(), getNegLog10PError()); + getLocation(), mBestGenotype, mRefGenotype, mRefBase, getReadCount(), getNegLog10PError()); } private void lazyEval() { @@ -223,12 +219,12 @@ public class GeliGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked } /** - * get the SAM records that back this genotype call + * get the pileup that backs this genotype call * - * @return a list of SAM records + * @return a pileup */ - public List getReads() { - return mReads; + public ReadBackedPileup getPileup() { + return mPileup; } /** @@ -237,7 +233,7 @@ public class GeliGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked * @return the number of reads we're backed by */ public int getReadCount() { - return mReads.size(); + return (mPileup != null ? mPileup.getReads().size() : 0); } /** diff --git a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java index 6f8faec1b..864471b4b 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java @@ -67,7 +67,7 @@ public class GeliTextWriter implements GenotypeWriter { nextVrsRef = lks[9] - posteriors[DiploidGenotype.createHomGenotype(ref).ordinal()]; double maxMappingQual = 0; - List recs = gCall.getReads(); + List recs = gCall.getPileup().getReads(); int readDepth = recs.size(); for (SAMRecord rec : recs) { if (maxMappingQual < rec.getMappingQuality()) maxMappingQual = rec.getMappingQuality(); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFGenotypeCall.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFGenotypeCall.java index 999e58a05..78cddf92a 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFGenotypeCall.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFGenotypeCall.java @@ -1,17 +1,13 @@ package org.broadinstitute.sting.utils.genotype.glf; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.*; -import java.util.List; -import java.util.ArrayList; import java.util.Arrays; /** - * @author aaron + * @author ebanks *

* Class GLFGenotypeCall *

@@ -21,7 +17,7 @@ public class GLFGenotypeCall implements Genotype, ReadBacked, LikelihoodsBacked private final char mRefBase; private final GenomeLoc mLocation; - private List mReads; + private ReadBackedPileup mPileup; private double[] mLikelihoods; private double mNegLog10PError; @@ -30,6 +26,8 @@ public class GLFGenotypeCall implements Genotype, ReadBacked, LikelihoodsBacked /** * Generate a single sample genotype object, containing everything we need to represent calls out of a genotyper object * + * @param ref the ref character + * @param loc the genome loc */ public GLFGenotypeCall(char ref, GenomeLoc loc) { mRefBase = ref; @@ -39,12 +37,12 @@ public class GLFGenotypeCall implements Genotype, ReadBacked, LikelihoodsBacked mLocation = loc; mLikelihoods = new double[10]; Arrays.fill(mLikelihoods, Double.MIN_VALUE); - mReads = new ArrayList(); + mPileup = null; mNegLog10PError = Double.MIN_VALUE; } - public void setReads(List reads) { - mReads = new ArrayList(reads); + public void setPileup(ReadBackedPileup pileup) { + mPileup = pileup; } public void setLikelihoods(double[] likelihoods) { @@ -68,7 +66,7 @@ public class GLFGenotypeCall implements Genotype, ReadBacked, LikelihoodsBacked public String toString() { return String.format("%s ref=%s depth=%d negLog10PError=%.2f", - getLocation(), mRefBase, mReads.size(), getNegLog10PError()); + getLocation(), mRefBase, getReadCount(), getNegLog10PError()); } /** @@ -155,12 +153,12 @@ public class GLFGenotypeCall implements Genotype, ReadBacked, LikelihoodsBacked } /** - * get the SAM records that back this genotype call + * get the pileup that backs this genotype call * - * @return a list of SAM records + * @return a pileup */ - public List getReads() { - return mReads; + public ReadBackedPileup getPileup() { + return mPileup; } /** @@ -169,7 +167,7 @@ public class GLFGenotypeCall implements Genotype, ReadBacked, LikelihoodsBacked * @return the number of reads we're backed by */ public int getReadCount() { - return mReads.size(); + return (mPileup != null ? mPileup.getReads().size() : 0); } /** diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java index a3543ad97..d87a79dd1 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java @@ -133,7 +133,9 @@ public class GLFWriter implements GenotypeWriter { obj.setLikelihoodType(LikelihoodObject.LIKELIHOOD_TYPE.NEGATIVE_LOG); // transform! ... to negitive log likelihoods // calculate the RMS mapping qualities and the read depth - double rms = calculateRMS(gCall.getReads()); + double rms = 0.0; + if ( gCall.getPileup() != null ) + rms = calculateRMS(gCall.getPileup().getReads()); int readCount = gCall.getReadCount(); this.addGenotypeCall(GenomeLocParser.getContigInfo(gCall.getLocation().getContig()),(int)gCall.getLocation().getStart(),(float)rms,ref,readCount,obj); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeCall.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeCall.java index c36a217ac..beafa999b 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeCall.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeCall.java @@ -1,12 +1,9 @@ package org.broadinstitute.sting.utils.genotype.vcf; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.*; import java.util.Arrays; -import java.util.ArrayList; -import java.util.List; /** @@ -20,7 +17,7 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked, private final char mRefBase; private final GenomeLoc mLocation; - private List mReads; + private ReadBackedPileup mPileup = null; private int mCoverage = 0; private double[] mPosteriors; @@ -41,7 +38,6 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked, mPosteriors = new double[10]; Arrays.fill(mPosteriors, Double.MIN_VALUE); mSampleName = ""; - mReads = new ArrayList(); } public VCFGenotypeCall(char ref, GenomeLoc loc, String genotype, double negLog10PError, int coverage, String sample) { @@ -68,12 +64,10 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked, else mNextGenotype = mRefGenotype; mPosteriors[mNextGenotype.ordinal()] = -1.0 * negLog10PError; - - mReads = new ArrayList(); } - public void setReads(List reads) { - mReads = new ArrayList(reads); + public void setPileup(ReadBackedPileup pileup) { + mPileup = pileup; } public void setPosteriors(double[] posteriors) { @@ -104,7 +98,7 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked, public String toString() { lazyEval(); return String.format("%s best=%s cmp=%s ref=%s depth=%d negLog10PError=%.2f", - getLocation(), mBestGenotype, mRefGenotype, mRefBase, mReads.size(), getNegLog10PError()); + getLocation(), mBestGenotype, mRefGenotype, mRefBase, getReadCount(), getNegLog10PError()); } private void lazyEval() { @@ -229,12 +223,12 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked, } /** - * get the SAM records that back this genotype call + * get the pileup that backs this genotype call * - * @return a list of SAM records + * @return a pileup */ - public List getReads() { - return mReads; + public ReadBackedPileup getPileup() { + return mPileup; } /** @@ -243,7 +237,7 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked, * @return the number of reads we're backed by */ public int getReadCount() { - return (mCoverage > 0 ? mCoverage : mReads.size()); + return (mCoverage > 0 ? mCoverage : (mPileup != null ? mPileup.getReads().size() : 0)); } /** diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java index 487f4624a..eaf24f470 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java @@ -126,14 +126,10 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { Map genotypeMap = genotypeListToSampleNameMap(genotypes); - // keep track of the total read depth - int totalReadDepth = 0; - for (String name : mHeader.getGenotypeSamples()) { if (genotypeMap.containsKey(name)) { Genotype gtype = genotypeMap.get(name); VCFGenotypeRecord record = VCFUtils.createVCFGenotypeRecord(params, (VCFGenotypeCall)gtype); - totalReadDepth += ((VCFGenotypeCall)gtype).getReadCount(); params.addGenotypeRecord(record); genotypeMap.remove(name); } else { @@ -150,7 +146,6 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { // info fields Map infoFields = getInfoFields((VCFGenotypeLocusData)locusdata, params); - infoFields.put("DP", String.format("%d", totalReadDepth)); // q-score double qual = (locusdata == null) ? 0 : ((VCFGenotypeLocusData)locusdata).getConfidence(); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java index 4f4bb7bd3..42ddd23b5 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java @@ -31,8 +31,8 @@ public class VCFRecord { private double mQual; // our filter string private String mFilterString; - // our info fields - private final Map mInfoFields = new HashMap(); + // our info fields -- use a TreeMap to ensure they can be pulled out in order (so it passes integration tests) + private final Map mInfoFields = new TreeMap(); private final String mGenotypeFormatString; @@ -336,6 +336,10 @@ public class VCFRecord { public void addInfoField(String key, String value) { //System.out.printf("Adding info field %s=%s%n", key, value); this.mInfoFields.put(key, value); + + // remove the empty token if it's present + if ( mInfoFields.containsKey(".") ) + mInfoFields.remove("."); } public void printInfoFields() { diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java index aa3083ee2..0ef284ec3 100755 --- a/java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java @@ -91,7 +91,7 @@ public class RodVCFTest extends BaseTest { @Test public void testToString() { // slightly altered line, due to map ordering - final String firstLine = "20\t14370\trs6054257\tG\tA\t29.00\t0\tDP=258;AF=0.786;NS=58\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5\n"; + final String firstLine = "20\t14370\trs6054257\tG\tA\t29.00\t0\tAF=0.786;DP=258;NS=58\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5\n"; RodVCF vcf = getVCFObject(); VCFReader reader = new VCFReader(vcfFile); Iterator iter = vcf.createIterator("VCF", vcfFile); diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java new file mode 100755 index 000000000..94bb7f7b2 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -0,0 +1,44 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.WalkerTest; +import org.junit.Test; + +import java.util.Arrays; + +public class VariantAnnotatorIntegrationTest extends WalkerTest { + public static String baseTestString() { + return "-T VariantAnnotator -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 20:10,000,000-10,010,000 -vcf %s"; + } + + @Test + public void testNoAnnots1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf ", 1, + Arrays.asList("4e231f16c202d88ca3adb17168af0e0f")); + executeTest("testNoAnnots1", spec); + } + + @Test + public void testNoAnnots2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample3.vcf ", 1, + Arrays.asList("ef0c59e47a2afcbecf2bcef6aa01e7e7")); + executeTest("testNoAnnots2", spec); + } + + @Test + public void testAllAnnots1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -all -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf ", 1, + Arrays.asList("ced92b5ac9e2692c4d8acce1235317b6")); + executeTest("testAllAnnots1", spec); + } + + @Test + public void testAllAnnots2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -all -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample3.vcf ", 1, + Arrays.asList("573a6c02f659b2c4cf014f84bd0b9c8a")); + executeTest("testAllAnnots2", spec); + } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java index 480219bcc..30188fc3d 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java @@ -14,7 +14,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest { public void testSimpleVenn() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn", 1, - Arrays.asList("1b8e26cd30e993da9318abd6475f38d0")); + Arrays.asList("2fc12b6f02f4cb589f2fd134e765d6b7")); executeTest("testSimpleVenn", spec); } @@ -22,7 +22,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest { public void testSNPConcordance() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SNPGenotypeConcordance:qscore=5", 1, - Arrays.asList("5a89b8edcdf2e3f469ac354cb1524033")); + Arrays.asList("142bcfcc6bb404cd4bd1a4624fa9a15e")); executeTest("testSNPConcordance", spec); } @@ -30,7 +30,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest { public void testNWayVenn() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -B set3,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/CEU.sample.vcf -CT NWayVenn", 1, - Arrays.asList("1dec083580b75a9c59fcb61426117134")); + Arrays.asList("e067aace9080bafdf3312632de60b4fc")); executeTest("testNWayVenn", spec); } } \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java deleted file mode 100755 index 6eac757f6..000000000 --- a/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ /dev/null @@ -1,80 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.filters; - -import org.broadinstitute.sting.WalkerTest; -import org.junit.Test; - -import java.util.Arrays; - -public class VariantFiltrationIntegrationTest extends WalkerTest { - @Test - public void testIntervals() { - String[] md5DoC = {"d11547059193b90e2fd1eb47dac18999", "21c8e1f9dc65fdfb39347547f9b04011"}; - WalkerTestSpec spec1 = new WalkerTestSpec( - "-T VariantFiltration -X DepthOfCoverage:max=70 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5DoC)); - executeTest("testDoCFilter", spec1); - - String[] md5AlleleBalance = {"409f2b7fb193919d28c20be4dc5c4516", "a13e4ce6260bf9f33ca99dc808b8e6ad"}; - WalkerTestSpec spec2 = new WalkerTestSpec( - "-T VariantFiltration -X AlleleBalance:low=0.25,high=0.75 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5AlleleBalance)); - executeTest("testAlleleBalanceFilter", spec2); - - String[] md5Strand = {"e5806925867088edeee9f6276ab030f3", "0f7db0aad764268ee8fa3b857df8d87d"}; - WalkerTestSpec spec3 = new WalkerTestSpec( - "-T VariantFiltration -X FisherStrand:pvalue=0.0001 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5Strand)); - executeTest("testStrandFilter", spec3); - - String[] md5Lod = {"2c93f56e07e320ac7a012bd4940b1dbc", "7e0c4f2b0fda85fd2891eee76c396a55"}; - WalkerTestSpec spec4 = new WalkerTestSpec( - "-T VariantFiltration -X LodThreshold:lod=10 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5Lod)); - executeTest("testLodFilter", spec4); - - String[] md5MQ0 = {"fde4be5eec12be3405863649170aaaac", "3203de335621851bccf596242b079e23"}; - WalkerTestSpec spec5 = new WalkerTestSpec( - "-T VariantFiltration -X MappingQualityZero:max=70 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5MQ0)); - executeTest("testMappingQuality0Filter", spec5); - - String[] md5MQ = {"17c35a5ccd814991dbcd3be6c2f244cd", "ecc777feedea61f7b570d114c2ab89b1"}; - WalkerTestSpec spec6 = new WalkerTestSpec( - "-T VariantFiltration -X MappingQuality:min=20 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5MQ)); - executeTest("testRMSMappingQualityFilter", spec6); - - String[] md5OnOff = {"fc649a7ebd4a7b7fa739fc4fcb7704e2", "67f2e1bc025833b0fa31f47195198997"}; - WalkerTestSpec spec7 = new WalkerTestSpec( - "-T VariantFiltration -X OnOffGenotypeRatio:threshold=0.9 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5OnOff)); - executeTest("testOnOffGenotypeFilter", spec7); - - String[] md5Clusters = {"9b3aede689a25431ed2e1aac9dd73dc6", "8fa6b6ffc93ee7fb8d6b52a7fb7815ef"}; - WalkerTestSpec spec8 = new WalkerTestSpec( - "-T VariantFiltration -X ClusteredSnps:window=10,snps=3 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5Clusters)); - executeTest("testClusteredSnpsFilter", spec8); - - String[] md5Indels = {"c36cb0021f90b3509e483b57f4f41f6e", "8e0e915a1cb63d7049e0671ed00101fe"}; - WalkerTestSpec spec9 = new WalkerTestSpec( - "-T VariantFiltration -X IndelArtifact -B indels,PointIndel,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.indels -B cleaned,CleanedOutSNP,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.realigner_badsnps -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5Indels)); - executeTest("testIndelArtifactFilter", spec9); - - - - - - - } -} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 24f78d34b..09548fa07 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -7,6 +7,10 @@ import java.util.Arrays; import java.util.HashMap; import java.util.Map; +// ********************************************************************************** // +// Note that this class also serves as an integration test for the VariantAnnotator! // +// ********************************************************************************** // + public class UnifiedGenotyperIntegrationTest extends WalkerTest { public static String baseTestString() { return "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s"; @@ -42,16 +46,16 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testMultiSamplePilot1PointEM() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 50", 1, - Arrays.asList("b7e12c4011d0043024e0dd2dd5764752")); + "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1, + Arrays.asList("b992e55996942c893948ea85660478ba")); executeTest("testMultiSamplePilot1 - Point Estimate EM", spec); } @Test public void testMultiSamplePilot2PointEM() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 50", 1, - Arrays.asList("89c600d72a815c09412c97f82fa2281e")); + "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1, + Arrays.asList("6d28b2af631805dc593508a21ef46a83")); executeTest("testMultiSamplePilot2 - Point Estimate EM", spec); } @@ -63,24 +67,24 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testMultiSamplePilot1Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm JOINT_ESTIMATE -confidence 50", 1, - Arrays.asList("a96862eb5bd3d8db143712f427e3db91")); + "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, + Arrays.asList("90c5129f298075ee0e18233b3763f25d")); executeTest("testMultiSamplePilot1 - Joint Estimate", spec); } @Test public void testMultiSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 50", 1, - Arrays.asList("22735bba0cb6ea3984f1d3913e376ac4")); + "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, + Arrays.asList("033390940ecc0e2dcda5559d6a1802fa")); executeTest("testMultiSamplePilot2 - Joint Estimate", spec); } @Test public void testSingleSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,067,000-10,083,000 -bm empirical -gm JOINT_ESTIMATE -confidence 50", 1, - Arrays.asList("580248cfb813824194bda830427ab3d6")); + "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,067,000-10,083,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, + Arrays.asList("8deb8b1132e7ddf28c7a0d919ce22985")); executeTest("testSingleSamplePilot2 - Joint Estimate", spec); }