From 45583755754eeffab4cede53e0f63e6512314c17 Mon Sep 17 00:00:00 2001 From: ebanks Date: Mon, 16 Nov 2009 02:41:20 +0000 Subject: [PATCH] Stage 1 of the VariantFiltration refactoring is now complete. There now exists a parallel tool called VariantAnnotator which simply takes variant calls and annotates them with the same type of data that we used to use for filtering (e.g. DoC, allele balance). The output is a VCF with the INFO field appropriately annotated. VariantAnnotator can be called as a standalone walker or by another walker, as it is by the UnifiedGenotyper. UG now no longer computes any of this meta data - it relegates the task completely to the annotator (assuming the output format accepts it). This is a fairly all-encompassing check in. It involves changes to all of the UG code, bug fixes to much of the VCF code as things popped up, and other changes throughout. All integration tests pass and I've tediously confirmed that the annotation values are correct, but this framework could use some more rigorous testing. Stage 2 of the process will happen later this week. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2053 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/annotator/AlleleBalance.java | 119 ++++++++ .../walkers/annotator/DepthOfCoverage.java | 19 ++ .../gatk/walkers/annotator/FisherStrand.java | 179 ++++++++++++ .../walkers/annotator/HomopolymerRun.java | 59 ++++ .../walkers/annotator/MappingQualityZero.java | 25 ++ .../gatk/walkers/annotator/OnOffGenotype.java | 123 ++++++++ .../walkers/annotator/RMSMappingQuality.java | 25 ++ .../walkers/annotator/SpanningDeletions.java | 23 ++ .../walkers/annotator/VariantAnnotation.java | 15 + .../walkers/annotator/VariantAnnotator.java | 265 ++++++++++++++++++ .../concordance/CallsetConcordanceWalker.java | 4 +- .../genotyper/EMGenotypeCalculationModel.java | 3 +- ...JointEstimateGenotypeCalculationModel.java | 94 +------ ...PointEstimateGenotypeCalculationModel.java | 25 +- .../walkers/genotyper/UnifiedGenotyper.java | 49 ++-- .../broadinstitute/sting/utils/MathUtils.java | 21 ++ .../org/broadinstitute/sting/utils/Utils.java | 25 ++ .../sting/utils/genotype/ReadBacked.java | 16 +- .../utils/genotype/geli/GeliAdapter.java | 10 +- .../utils/genotype/geli/GeliGenotypeCall.java | 26 +- .../utils/genotype/geli/GeliTextWriter.java | 2 +- .../utils/genotype/glf/GLFGenotypeCall.java | 30 +- .../sting/utils/genotype/glf/GLFWriter.java | 4 +- .../utils/genotype/vcf/VCFGenotypeCall.java | 24 +- .../vcf/VCFGenotypeWriterAdapter.java | 5 - .../sting/utils/genotype/vcf/VCFRecord.java | 8 +- .../sting/gatk/refdata/RodVCFTest.java | 2 +- .../VariantAnnotatorIntegrationTest.java | 44 +++ .../CallsetConcordanceIntegrationTest.java | 6 +- .../VariantFiltrationIntegrationTest.java | 80 ------ .../UnifiedGenotyperIntegrationTest.java | 24 +- 31 files changed, 1051 insertions(+), 303 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/OnOffGenotype.java create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotation.java create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java create mode 100755 java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java delete mode 100755 java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java new file mode 100755 index 000000000..a81ba4a71 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java @@ -0,0 +1,119 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.*; + +import java.util.List; +import java.util.ArrayList; + + +public class AlleleBalance implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + + double ratio; + + Genotype g = genotypes.get(0); + if ( g instanceof ReadBacked && g instanceof PosteriorsBacked ) { + Pair weightedBalance = computeWeightedBalance(ref.getBase(), genotypes, pileup); + if ( weightedBalance.second == 0 ) + return null; + ratio = weightedBalance.first; + } else { + // this test doesn't make sense for homs + Genotype genotype = VariantAnnotator.getFirstHetVariant(genotypes); + if ( genotype == null ) + return null; + + final String genotypeStr = genotype.getBases().toUpperCase(); + if ( genotypeStr.length() != 2 ) + return null; + + final String bases = pileup.getBases().toUpperCase(); + if ( bases.length() == 0 ) + return null; + + ratio = computeSingleBalance(ref.getBase(), genotypeStr, bases); + } + + return new Pair("AlleleBalance", String.format("%.2f", ratio)); + } + + private double computeSingleBalance(char ref, final String genotypeStr, final String bases) { + + char a = genotypeStr.charAt(0); + char b = genotypeStr.charAt(1); + int aCount = Utils.countOccurrences(a, bases); + int bCount = Utils.countOccurrences(b, bases); + + int refCount = a == ref ? aCount : bCount; + int altCount = a == ref ? bCount : aCount; + + double ratio = (double)refCount / (double)(refCount + altCount); + return ratio; + } + + // returns the ratio and then number of points which comprise it + private Pair computeWeightedBalance(char ref, List genotypes, ReadBackedPileup pileup) { + + ArrayList refBalances = new ArrayList(); + ArrayList weights = new ArrayList(); + + // accumulate ratios and weights + for ( Genotype g : genotypes ) { + + if ( !(g instanceof ReadBacked) || !(g instanceof PosteriorsBacked) ) + continue; + + final String genotypeStr = g.getBases().toUpperCase(); + if ( genotypeStr.length() != 2 ) + continue; + + DiploidGenotype bestGenotype = DiploidGenotype.unorderedValueOf(genotypeStr); + + // we care only about het ref calls + if ( !bestGenotype.isHetRef(ref) ) + continue; + + char a = genotypeStr.charAt(0); + char b = genotypeStr.charAt(1); + char altBase = a != ref ? a : b; + + // get the base counts at this pileup (minus deletions) + ReadBackedPileup myPileup = ((ReadBacked)g).getPileup(); + + // if the pileup is null, we'll just have to use the full pileup (it's better than nothing) + if ( myPileup == null ) + myPileup = pileup; + + int[] counts = myPileup.getBasePileupAsCounts(); + int refCount = counts[BaseUtils.simpleBaseToBaseIndex(ref)]; + int altCount = counts[BaseUtils.simpleBaseToBaseIndex(altBase)]; + + double[] posteriors = ((PosteriorsBacked)g).getPosteriors(); + posteriors = MathUtils.normalizeFromLog10(posteriors); + weights.add(posteriors[bestGenotype.ordinal()]); + refBalances.add((double)refCount / (double)(refCount + altCount)); + } + + double ratio = 0.0; + + if ( weights.size() > 0 ) { + // normalize the weights + double sum = 0.0; + for (int i = 0; i < weights.size(); i++) + sum += weights.get(i); + for (int i = 0; i < weights.size(); i++) + weights.set(i, weights.get(i) / sum); + + // calculate total weighted ratios + for (int i = 0; i < weights.size(); i++) + ratio += weights.get(i) * refBalances.get(i); + } + + return new Pair(ratio, weights.size()); + } + + public boolean useZeroQualityReads() { return false; } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java new file mode 100755 index 000000000..85e9258ac --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -0,0 +1,19 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.genotype.Genotype; + +import java.util.List; + + +public class DepthOfCoverage implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + int depth = pileup.getReads().size(); + return new Pair("DoC", String.format("%d", depth)); + } + + public boolean useZeroQualityReads() { return false; } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java new file mode 100755 index 000000000..28634d9bb --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -0,0 +1,179 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.genotype.Genotype; +import net.sf.samtools.SAMRecord; +import cern.jet.math.Arithmetic; + +import java.util.List; + + +public class FisherStrand implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + + // this test doesn't make sense for homs + Genotype genotype = VariantAnnotator.getFirstHetVariant(genotypes); + if ( genotype == null ) + return null; + + final String genotypeStr = genotype.getBases().toUpperCase(); + if ( genotypeStr.length() != 2 ) + return null; + + int allele1 = BaseUtils.simpleBaseToBaseIndex(genotypeStr.charAt(0)); + int allele2 = BaseUtils.simpleBaseToBaseIndex(genotypeStr.charAt(1)); + + Double pvalue = strandTest(pileup, allele1, allele2); + if ( pvalue == null ) + return null; + + return new Pair("FisherStrand", String.format("%.4f", pvalue)); + } + + public boolean useZeroQualityReads() { return false; } + + private Double strandTest(ReadBackedPileup pileup, int allele1, int allele2) { + int[][] table = getContingencyTable(pileup, allele1, allele2); + if ( !variantIsHet(table) ) + return null; + + double pCutoff = computePValue(table); + //printTable(table, pCutoff); + + double pValue = pCutoff; + while (rotateTable(table)) { + double pValuePiece = computePValue(table); + + //printTable(table, pValuePiece); + + if (pValuePiece <= pCutoff) { + pValue += pValuePiece; + } + } + + table = getContingencyTable(pileup, allele1, allele2); + + while (unrotateTable(table)) { + double pValuePiece = computePValue(table); + + //printTable(table, pValuePiece); + + if (pValuePiece <= pCutoff) { + pValue += pValuePiece; + } + } + + //System.out.printf("P-cutoff: %f\n", pCutoff); + //System.out.printf("P-value: %f\n\n", pValue); + + return pValue; + } + + private static boolean variantIsHet(int[][] table) { + return ((table[0][1] != 0 || table[0][1] != 0) && (table[1][0] != 0 || table[1][1] != 0)); + } + + private static void printTable(int[][] table, double pValue) { + System.out.printf("%d %d; %d %d : %f\n", table[0][0], table[0][1], table[1][0], table[1][1], pValue); + } + + private static boolean rotateTable(int[][] table) { + table[0][0] -= 1; + table[1][0] += 1; + + table[0][1] += 1; + table[1][1] -= 1; + + return (table[0][0] >= 0 && table[1][1] >= 0) ? true : false; + } + + private static boolean unrotateTable(int[][] table) { + table[0][0] += 1; + table[1][0] -= 1; + + table[0][1] -= 1; + table[1][1] += 1; + + return (table[0][1] >= 0 && table[1][0] >= 0) ? true : false; + } + + private double computePValue(int[][] table) { + + int[] rowSums = { sumRow(table, 0), sumRow(table, 1) }; + int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) }; + int N = rowSums[0] + rowSums[1]; + + // calculate in log space so we don't die with high numbers + double pCutoff = Arithmetic.logFactorial(rowSums[0]) + + Arithmetic.logFactorial(rowSums[1]) + + Arithmetic.logFactorial(colSums[0]) + + Arithmetic.logFactorial(colSums[1]) + - Arithmetic.logFactorial(table[0][0]) + - Arithmetic.logFactorial(table[0][1]) + - Arithmetic.logFactorial(table[1][0]) + - Arithmetic.logFactorial(table[1][1]) + - Arithmetic.logFactorial(N); + return Math.exp(pCutoff); + } + + private static int sumRow(int[][] table, int column) { + int sum = 0; + for (int r = 0; r < table.length; r++) { + sum += table[r][column]; + } + + return sum; + } + + private static int sumColumn(int[][] table, int row) { + int sum = 0; + for (int c = 0; c < table[row].length; c++) { + sum += table[row][c]; + } + + return sum; + } + + /** + Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: + * fw rc + * allele1 # # + * allele2 # # + * @param pileup the pileup for the locus + * @param allele1 information for the called variant + * @param allele2 information for the called variant + * @return a 2x2 contingency table + */ + private static int[][] getContingencyTable(ReadBackedPileup pileup, int allele1, int allele2) { + + int[][] table = new int[2][2]; + + List reads = pileup.getReads(); + List offsets = pileup.getOffsets(); + + for (int readIndex = 0; readIndex < reads.size(); readIndex++) { + SAMRecord read = reads.get(readIndex); + int offset = offsets.get(readIndex); + + // skip over deletion sites + if ( offset == -1 ) + continue; + + int readAllele = BaseUtils.simpleBaseToBaseIndex(read.getReadString().charAt(offset)); + boolean isFW = !read.getReadNegativeStrandFlag(); + + if (readAllele == allele1 || readAllele == allele2) { + int row = (readAllele == allele1) ? 0 : 1; + int column = isFW ? 0 : 1; + + table[row][column]++; + } + } + + return table; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java new file mode 100755 index 000000000..1a1277e53 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java @@ -0,0 +1,59 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.genotype.Genotype; + +import java.util.List; + + +public class HomopolymerRun implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + + Genotype genotype = VariantAnnotator.getFirstVariant(ref.getBase(), genotypes); + if ( genotype == null ) + return null; + + final String genotypeStr = genotype.getBases().toUpperCase(); + if ( genotypeStr.length() != 2 ) + return null; + + char altAllele = (genotypeStr.charAt(0) != ref.getBase() ? genotypeStr.charAt(0) : genotypeStr.charAt(1)); + + int run = computeHomopolymerRun(altAllele, ref); + return new Pair("HomopolymerRun", String.format("%d", run)); + } + + public boolean useZeroQualityReads() { return false; } + + private static int computeHomopolymerRun(char altAllele, ReferenceContext ref) { + + // TODO -- this needs to be computed in a more accurate manner + // We currently look only at direct runs of the alternate allele adjacent to this position + + char[] bases = ref.getBases(); + GenomeLoc window = ref.getWindow(); + GenomeLoc locus = ref.getLocus(); + + int refBasePos = (int)(locus.getStart() - window.getStart()); + + int leftRun = 0; + for ( int i = refBasePos - 1; i >= 0; i--) { + if ( bases[i] != altAllele ) + break; + leftRun++; + } + + int rightRun = 0; + for ( int i = refBasePos + 1; i < bases.length; i++) { + if ( bases[i] != altAllele ) + break; + rightRun++; + } + + return Math.max(leftRun, rightRun); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java new file mode 100755 index 000000000..9a83d648b --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java @@ -0,0 +1,25 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.genotype.Genotype; +import net.sf.samtools.SAMRecord; + +import java.util.List; + + +public class MappingQualityZero implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + List reads = pileup.getReads(); + int MQ0Count = 0; + for (int i=0; i < reads.size(); i++) { + if ( reads.get(i).getMappingQuality() == 0 ) + MQ0Count++; + } + return new Pair("MAPQ0", String.format("%d", MQ0Count)); + } + + public boolean useZeroQualityReads() { return true; } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/OnOffGenotype.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/OnOffGenotype.java new file mode 100755 index 000000000..142c50207 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/OnOffGenotype.java @@ -0,0 +1,123 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.Genotype; +import org.broadinstitute.sting.utils.genotype.ReadBacked; +import org.broadinstitute.sting.utils.genotype.PosteriorsBacked; +import org.broadinstitute.sting.utils.genotype.DiploidGenotype; + +import java.util.List; +import java.util.ArrayList; + +public class OnOffGenotype implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + + double ratio; + + Genotype g = genotypes.get(0); + if ( g instanceof ReadBacked && g instanceof PosteriorsBacked) { + Pair weightedBalance = computeWeightedBalance(ref.getBase(), genotypes, pileup); + if ( weightedBalance.second == 0 ) + return null; + ratio = weightedBalance.first; + } else { + Genotype genotype = VariantAnnotator.getFirstVariant(ref.getBase(), genotypes); + if ( genotype == null ) + return null; + + final String genotypeStr = genotype.getBases().toUpperCase(); + if ( genotypeStr.length() != 2 ) + return null; + + final String bases = pileup.getBases().toUpperCase(); + if ( bases.length() == 0 ) + return null; + + ratio = computeSingleBalance(genotypeStr, bases); + } + + return new Pair("OnOffGenotype", String.format("%.2f", ratio)); + } + + private double computeSingleBalance(final String genotypeStr, final String bases) { + + int on = 0, off = 0; + for ( char base : BaseUtils.BASES ) { + int count = BasicPileup.countBase(base, bases); + if ( Utils.countOccurrences(base, genotypeStr) > 0 ) + on += count; + else + off += count; + } + + double ratio = (double)on / (double)(on + off); + return ratio; + } + + private Pair computeWeightedBalance(char ref, List genotypes, ReadBackedPileup pileup) { + + ArrayList onOffBalances = new ArrayList(); + ArrayList weights = new ArrayList(); + + // accumulate ratios and weights + for ( Genotype g : genotypes ) { + + if ( !(g instanceof ReadBacked) || !(g instanceof PosteriorsBacked) ) + continue; + + final String genotypeStr = g.getBases().toUpperCase(); + if ( genotypeStr.length() != 2 ) + continue; + + DiploidGenotype bestGenotype = DiploidGenotype.unorderedValueOf(genotypeStr); + + // we care only about non-ref calls + if ( bestGenotype.isHomRef(ref) ) + continue; + + char a = genotypeStr.charAt(0); + char b = genotypeStr.charAt(1); + + // get the base counts at this pileup (minus deletions) + ReadBackedPileup myPileup = ((ReadBacked)g).getPileup(); + + // if the pileup is null, we'll just have to use the full pileup (it's better than nothing) + if ( myPileup == null ) + myPileup = pileup; + + int[] counts = myPileup.getBasePileupAsCounts(); + int onCount = counts[BaseUtils.simpleBaseToBaseIndex(a)]; + if ( a != b ) + onCount += counts[BaseUtils.simpleBaseToBaseIndex(b)]; + int totalCount = 0; + for (int i = 0; i < counts.length; i++) + totalCount += counts[i]; + + double[] posteriors = ((PosteriorsBacked)g).getPosteriors(); + posteriors = MathUtils.normalizeFromLog10(posteriors); + weights.add(posteriors[bestGenotype.ordinal()]); + onOffBalances.add((double)onCount / (double)totalCount); + } + + double ratio = 0.0; + + if ( weights.size() > 0 ) { + // normalize the weights + double sum = 0.0; + for (int i = 0; i < weights.size(); i++) + sum += weights.get(i); + for (int i = 0; i < weights.size(); i++) + weights.set(i, weights.get(i) / sum); + + // calculate total weighted ratios + for (int i = 0; i < weights.size(); i++) + ratio += weights.get(i) * onOffBalances.get(i); + } + + return new Pair(ratio, weights.size()); + } + + public boolean useZeroQualityReads() { return false; } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java new file mode 100755 index 000000000..878f739f1 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -0,0 +1,25 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.genotype.Genotype; +import net.sf.samtools.SAMRecord; + +import java.util.List; + + +public class RMSMappingQuality implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + List reads = pileup.getReads(); + int[] qualities = new int[reads.size()]; + for (int i=0; i < reads.size(); i++) + qualities[i] = reads.get(i).getMappingQuality(); + double rms = MathUtils.rms(qualities); + return new Pair("RMSMAPQ", String.format("%.2f", rms)); + } + + public boolean useZeroQualityReads() { return true; } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java new file mode 100755 index 000000000..58f5edc46 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -0,0 +1,23 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.genotype.Genotype; + +import java.util.List; + + +public class SpanningDeletions implements VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes) { + int deletions = 0; + for (Integer offset : pileup.getOffsets() ) { + if ( offset == -1 ) + deletions++; + } + return new Pair("SpanningDeletions", String.format("%d", deletions)); + } + + public boolean useZeroQualityReads() { return false; } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotation.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotation.java new file mode 100755 index 000000000..8146c4a82 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotation.java @@ -0,0 +1,15 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.genotype.Genotype; + +import java.util.List; + +public interface VariantAnnotation { + + public Pair annotate(ReferenceContext ref, ReadBackedPileup pileup, List genotypes); + public boolean useZeroQualityReads(); + +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java new file mode 100755 index 000000000..bf0ee18b0 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -0,0 +1,265 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.*; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.vcf.*; +import org.broadinstitute.sting.utils.genotype.*; +import org.broadinstitute.sting.utils.genotype.Genotype; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.playground.gatk.walkers.variantstovcf.VariantsToVCF; + +import java.util.*; +import java.io.*; + +import net.sf.samtools.SAMRecord; + + +/** + * VariantAnnotator annotates variants. + */ +//@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariationRod.class)) +@Allows(value={DataSource.READS, DataSource.REFERENCE}) +@Reference(window=@Window(start=-20,stop=20)) +public class VariantAnnotator extends RodWalker { + @Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true) + protected File VCF_OUT; + @Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false) + protected String sampleName = null; + @Argument(fullName="annotations", shortName="A", doc="Annotation types to apply to variant calls", required=false) + protected String[] ANNOTATIONS; + @Argument(fullName="useAllAnnotations", shortName="all", doc="Use all possible annotations", required=false) + protected Boolean USE_ALL_ANNOTATIONS = false; + @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit") + protected Boolean LIST = false; + + private VCFWriter vcfWriter; + private VCFHeader vcfHeader; + + private HashMap nonVCFsampleName = new HashMap(); + + private ArrayList requestedAnnotations; + + // mapping from class name to class + private static HashMap allAnnotations = null; + + + private static void determineAllAnnotations() { + allAnnotations = new HashMap(); + List> annotationClasses = PackageUtils.getClassesImplementingInterface(VariantAnnotation.class); + for ( Class c : annotationClasses ) { + try { + VariantAnnotation annot = (VariantAnnotation) c.newInstance(); + allAnnotations.put(c.getSimpleName().toUpperCase(), annot); + } catch (InstantiationException e) { + throw new StingException(String.format("Cannot instantiate annotation class '%s': must be concrete class", c.getSimpleName())); + } catch (IllegalAccessException e) { + throw new StingException(String.format("Cannot instantiate annotation class '%s': must have no-arg constructor", c.getSimpleName())); + } + } + } + + private void listFiltersAndExit() { + List> annotationClasses = PackageUtils.getClassesImplementingInterface(VariantAnnotation.class); + out.println("\nAvailable annotations:"); + for (int i = 0; i < annotationClasses.size(); i++) + out.println("\t" + annotationClasses.get(i).getSimpleName()); + out.println(); + System.exit(0); + } + + /** + * Prepare the output file and the list of available features. + */ + public void initialize() { + + if ( LIST ) + listFiltersAndExit(); + + // get the list of all sample names from the various VCF input rods + HashSet samples = new HashSet(); + VCFUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap, String>()); + + // add the non-VCF sample from the command-line, if applicable + if ( sampleName != null ) { + nonVCFsampleName.put(sampleName.toUpperCase(), "variant"); + samples.add(sampleName.toUpperCase()); + } + + // if there are no valid samples, die + if ( samples.size() == 0 ) + throw new StingException("There are no samples input at all; use the --sampleName argument to specify one"); + + determineAllAnnotations(); + + if ( USE_ALL_ANNOTATIONS ) { + requestedAnnotations = new ArrayList(allAnnotations.values()); + } else { + requestedAnnotations = new ArrayList(); + if ( ANNOTATIONS != null ) { + for ( String requested : ANNOTATIONS ) { + + VariantAnnotation annot = allAnnotations.get(requested.toUpperCase()); + if ( annot == null ) + throw new StingException("Unknown annotation '" + requested + "'. Issue the '-ls' argument to list available annotations."); + + requestedAnnotations.add(annot); + } + } + } + + // setup the header fields + Map hInfo = new HashMap(); + hInfo.put("format", VCFWriter.VERSION); + hInfo.put("source", "VariantAnnotator"); + hInfo.put("reference", this.getToolkit().getArguments().referenceFile.getName()); + + vcfHeader = new VCFHeader(hInfo, samples); + vcfWriter = new VCFWriter(vcfHeader, VCF_OUT); + } + + /** + * Initialize the number of loci processed to zero. + * + * @return 0 + */ + public Integer reduceInit() { return 0; } + + + /** + * We want reads that span deletions + * + * @return true + */ + public boolean includeReadsWithDeletionAtLoci() { return true; } + + /** + * For each site of interest, annotate based on the requested annotation types + * + * @param tracker the meta-data tracker + * @param ref the reference base + * @param context the context for the given locus + * @return 1 if the locus was successfully processed, 0 if otherwise + */ + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + RODRecordList rods = tracker.getTrackData("variant", null); + // ignore places where we don't have a variant + if ( rods == null || rods.getRecords().size() == 0 ) + return 0; + + Map annotations = new HashMap(); + VariationRod variant = (VariationRod)rods.getRecords().get(0); + + // if the reference base is not ambiguous, the variant is a SNP, and it's the appropriate type, we can annotate + if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 && variant.isSNP() && variant instanceof VariantBackedByGenotype ) { + final List genotypes = ((VariantBackedByGenotype)variant).getGenotypes(); + if ( genotypes != null ) + annotations = getAnnotations(ref, context, genotypes, requestedAnnotations); + } + + writeVCF(tracker, ref, context, variant, annotations); + + return 1; + } + + public static Map getAnnotations(ReferenceContext ref, AlignmentContext context, List genotypes) { + if ( allAnnotations == null ) + determineAllAnnotations(); + return getAnnotations(ref, context, genotypes, allAnnotations.values()); + } + + public static Map getAnnotations(ReferenceContext ref, AlignmentContext context, List genotypes, Collection annotations) { + + // set up the pileup for the full collection of reads at this position + ReadBackedPileup fullPileup = new ReadBackedPileup(ref.getBase(), context); + + // also, set up the pileup for the mapping-quality-zero-free context + List reads = context.getReads(); + List offsets = context.getOffsets(); + Iterator readIter = reads.iterator(); + Iterator offsetIter = offsets.iterator(); + ArrayList MQ0freeReads = new ArrayList(); + ArrayList MQ0freeOffsets = new ArrayList(); + while ( readIter.hasNext() ) { + SAMRecord read = readIter.next(); + Integer offset = offsetIter.next(); + if ( read.getMappingQuality() > 0 ) { + MQ0freeReads.add(read); + MQ0freeOffsets.add(offset); + } + } + ReadBackedPileup MQ0freePileup = new ReadBackedPileup(context.getLocation(), ref.getBase(), MQ0freeReads, MQ0freeOffsets); + + HashMap results = new HashMap(); + + for ( VariantAnnotation annotator : annotations) { + Pair annot = annotator.annotate(ref, (annotator.useZeroQualityReads() ? fullPileup : MQ0freePileup), genotypes); + if ( annot != null ) + results.put(annot.first, annot.second); + } + + return results; + } + + + private void writeVCF(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariationRod variant, Map annotations) { + VCFRecord rec = getVCFRecord(tracker, ref, context, variant); + if ( rec != null ) { + rec.addInfoFields(annotations); + vcfWriter.addRecord(rec); + } + } + + + private VCFRecord getVCFRecord(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariationRod variant) { + if ( variant instanceof RodVCF ) { + return ((RodVCF)variant).mCurrentRecord; + } else { + List gt = new ArrayList(); + Map map = new HashMap(); + return VariantsToVCF.generateVCFRecord(tracker, ref, context, vcfHeader, gt, map, nonVCFsampleName, out, false, false); + } + } + + /** + * Increment the number of loci processed. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return the new number of loci processed. + */ + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + /** + * Tell the user the number of loci processed and close out the new variants file. + * + * @param result the number of loci seen. + */ + public void onTraversalDone(Integer result) { + out.printf("Processed %d loci.\n", result); + + vcfWriter.close(); + } + + public static Genotype getFirstVariant(char ref, List genotypes) { + for ( Genotype g : genotypes ) { + if ( g.isVariant(ref) ) + return g; + } + return null; + } + + public static Genotype getFirstHetVariant(List genotypes) { + for ( Genotype g : genotypes ) { + if ( g.isHet() ) + return g; + } + return null; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java index 3d5041eb9..f0af463dc 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java @@ -33,7 +33,7 @@ public class CallsetConcordanceWalker extends RodWalker { private VCFWriter vcfWriter; // a map of rod name to uniquified sample name - HashMap, String> rodNamesToSampleNames = new HashMap, String>(); + private HashMap, String> rodNamesToSampleNames = new HashMap, String>(); /** @@ -53,7 +53,7 @@ public class CallsetConcordanceWalker extends RodWalker { System.exit(0); } - // get the list of all sample names from the various input rods (the need to be uniquified in case there's overlap) + // get the list of all sample names from the various input rods (they need to be uniquified in case there's overlap) HashSet samples = new HashSet(); VCFUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java index 9443e476e..8a50d2a94 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java @@ -97,7 +97,8 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode Genotype call = GenotypeWriterFactory.createSupportedCall(OUTPUT_FORMAT, ref, context.getLocation()); if ( call instanceof ReadBacked ) { - ((ReadBacked)call).setReads(context.getReads()); + ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts.get(sample).getContext(StratifiedContext.OVERALL)); + ((ReadBacked)call).setPileup(pileup); } if ( call instanceof SampleBacked ) { ((SampleBacked)call).setSampleName(sample); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java index 05c67aab9..140e49b10 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java @@ -166,7 +166,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(altAllele); double[] allelePosteriors = new double[] { refPosterior, posteriors[hetGenotype.ordinal()], posteriors[homGenotype.ordinal()] }; - normalizeFromLog10(allelePosteriors); + allelePosteriors = MathUtils.normalizeFromLog10(allelePosteriors); //logger.debug("Normalized posteriors for " + altAllele + ": " + allelePosteriors[0] + " " + allelePosteriors[1] + " " + allelePosteriors[2]); // calculate the posterior weighted frequencies @@ -191,7 +191,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo // multiply by null allele frequency priors to get AF posteriors, then normalize for (int i = 0; i < frequencyEstimationPoints; i++) alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i]; - normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]); + alleleFrequencyPosteriors[baseIndex] = MathUtils.normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]); // calculate p(f>0) double sum = 0.0; @@ -231,34 +231,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo return HWvalues; } - private static void normalizeFromLog10(double[] array) { - // for precision purposes, we need to add (or really subtract, since they're - // all negative) the largest value; also, we need to convert to normal-space. - double maxValue = findMaxEntry(array).first; - for (int i = 0; i < array.length; i++) - array[i] = Math.pow(10, array[i] - maxValue); - - // normalize - double sum = 0.0; - for (int i = 0; i < array.length; i++) - sum += array[i]; - for (int i = 0; i < array.length; i++) - array[i] /= sum; - } - - // returns the maximum value in the array and its index - private static Pair findMaxEntry(double[] array) { - int index = 0; - double max = array[0]; - for (int i = 1; i < array.length; i++) { - if ( array[i] > max ) { - max = array[i]; - index = i; - } - } - return new Pair(max, index); - } - private void printAlleleFrequencyData(char ref, GenomeLoc loc) { verboseWriter.println("Location=" + loc + ", ref=" + ref); @@ -312,7 +284,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo } double phredScaledConfidence = -10.0 * Math.log10(alleleFrequencyPosteriors[indexOfMax][0]); - int bestAFguess = findMaxEntry(alleleFrequencyPosteriors[indexOfMax]).second; + int bestAFguess = Utils.findIndexOfMaxEntry(alleleFrequencyPosteriors[indexOfMax]); // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero if ( !ALL_BASE_MODE && (bestAFguess == 0 || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) @@ -328,7 +300,8 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo Genotype call = GenotypeWriterFactory.createSupportedCall(OUTPUT_FORMAT, ref, context.getLocation()); if ( call instanceof ReadBacked ) { - ((ReadBacked)call).setReads(context.getReads()); + ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts.get(sample).getContext(StratifiedContext.OVERALL)); + ((ReadBacked)call).setPileup(pileup); } if ( call instanceof SampleBacked ) { ((SampleBacked)call).setSampleName(sample); @@ -360,63 +333,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo if ( dbsnp != null ) ((IDBacked)locusdata).setID(dbsnp.getRS_ID()); } - if ( locusdata instanceof ArbitraryFieldsBacked) { - ArrayList refBalances = new ArrayList(); - ArrayList onOffBalances = new ArrayList(); - ArrayList weights = new ArrayList(); - - // accumulate ratios and weights - for ( java.util.Map.Entry entry : GLs.entrySet() ) { - // determine the best genotype - Integer sorted[] = Utils.SortPermutation(entry.getValue().getPosteriors()); - DiploidGenotype bestGenotype = DiploidGenotype.values()[sorted[DiploidGenotype.values().length - 1]]; - - // we care only about het calls - if ( bestGenotype.isHetRef(ref) ) { - - // make sure the alt base is our target alt - char altBase = bestGenotype.base1 != ref ? bestGenotype.base1 : bestGenotype.base2; - if ( altBase != baseOfMax ) - continue; - - // get the base counts at this pileup (minus deletions) - ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts.get(entry.getKey()).getContext(StratifiedContext.OVERALL)); - int[] counts = pileup.getBasePileupAsCounts(); - int refCount = counts[BaseUtils.simpleBaseToBaseIndex(ref)]; - int altCount = counts[BaseUtils.simpleBaseToBaseIndex(altBase)]; - int totalCount = 0; - for (int i = 0; i < counts.length; i++) - totalCount += counts[i]; - - // add the entries - weights.add(entry.getValue().getNormalizedPosteriors()[bestGenotype.ordinal()]); - refBalances.add((double)refCount / (double)(refCount + altCount)); - onOffBalances.add((double)(refCount + altCount) / (double)totalCount); - } - } - - if ( weights.size() > 0 ) { - // normalize the weights - double sum = 0.0; - for (int i = 0; i < weights.size(); i++) - sum += weights.get(i); - for (int i = 0; i < weights.size(); i++) - weights.set(i, weights.get(i) / sum); - - // calculate total weighted ratios - double normalizedRefRatio = 0.0; - double normalizedOnOffRatio = 0.0; - for (int i = 0; i < weights.size(); i++) { - normalizedRefRatio += weights.get(i) * refBalances.get(i); - normalizedOnOffRatio += weights.get(i) * onOffBalances.get(i); - } - - HashMap fields = new HashMap(); - fields.put("AB", String.format("%.2f", normalizedRefRatio)); - fields.put("OO", String.format("%.2f", normalizedOnOffRatio)); - ((ArbitraryFieldsBacked)locusdata).setFields(fields); - } - } if ( locusdata instanceof SLODBacked ) { // the overall lod double overallLog10PofNull = Math.log10(alleleFrequencyPosteriors[indexOfMax][0]); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java index de125120a..5ff51a17f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java @@ -74,7 +74,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation Genotype call = GenotypeWriterFactory.createSupportedCall(OUTPUT_FORMAT, ref, context.getLocation()); if ( call instanceof ReadBacked ) { - ((ReadBacked)call).setReads(discoveryGL.first.getReads()); + ((ReadBacked)call).setPileup(discoveryGL.first); } if ( call instanceof SampleBacked ) { ((SampleBacked)call).setSampleName(sample); @@ -96,29 +96,6 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation if ( dbsnp != null ) ((IDBacked)locusdata).setID(dbsnp.getRS_ID()); } - if ( locusdata instanceof ArbitraryFieldsBacked) { - - DiploidGenotype bestGenotype = DiploidGenotype.values()[bestIndex]; - // we care only about het calls - if ( bestGenotype.isHetRef(ref) ) { - - char altBase = bestGenotype.base1 != ref ? bestGenotype.base1 : bestGenotype.base2; - - // get the base counts at this pileup (minus deletions) - int[] counts = discoveryGL.first.getBasePileupAsCounts(); - int refCount = counts[BaseUtils.simpleBaseToBaseIndex(ref)]; - int altCount = counts[BaseUtils.simpleBaseToBaseIndex(altBase)]; - int totalCount = 0; - for (int i = 0; i < counts.length; i++) - totalCount += counts[i]; - - // set the ratios - HashMap fields = new HashMap(); - fields.put("AB", String.format("%.2f", (double)refCount / (double)(refCount + altCount))); - fields.put("OO", String.format("%.2f",(double)(refCount + altCount) / (double)totalCount)); - ((ArbitraryFieldsBacked)locusdata).setFields(fields); - } - } } return new Pair, GenotypeLocusData>(Arrays.asList(call), locusdata); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 73182923d..f79a14df2 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -30,27 +30,23 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.ReadFilters; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotator; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection; -import org.broadinstitute.sting.utils.genotype.GenotypeWriter; -import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; -import org.broadinstitute.sting.utils.genotype.Genotype; -import org.broadinstitute.sting.utils.genotype.GenotypeLocusData; +import org.broadinstitute.sting.utils.genotype.*; import java.io.File; -import java.util.HashSet; -import java.util.List; -import java.util.ArrayList; +import java.util.*; -@ReadFilters({ZeroMappingQualityReadFilter.class}) +@Reference(window=@Window(start=-20,stop=20)) public class UnifiedGenotyper extends LocusWalker, GenotypeLocusData>, Integer> { @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); @@ -72,8 +68,6 @@ public class UnifiedGenotyper extends LocusWalker, GenotypeL // keep track of some metrics about our calls private CallMetrics callsMetrics; - // are we being called by another walker (which gets around the traversal-level filtering)? - private boolean calledByAnotherWalker = true; /** Enable deletions in the pileup **/ public boolean includeReadsWithDeletionAtLoci() { return true; } @@ -134,7 +128,6 @@ public class UnifiedGenotyper extends LocusWalker, GenotypeL return; // if we got here, then we were instantiated by the GATK engine - calledByAnotherWalker = false; // create the output writer stream if ( VARIANTS_FILE != null ) @@ -143,7 +136,8 @@ public class UnifiedGenotyper extends LocusWalker, GenotypeL this.getToolkit().getArguments().referenceFile.getName(), samples); else - writer = GenotypeWriterFactory.create(UAC.VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out, "UnifiedGenotyper", + writer = GenotypeWriterFactory.create(UAC.VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out, + "UnifiedGenotyper", this.getToolkit().getArguments().referenceFile.getName(), samples); callsMetrics = new CallMetrics(); @@ -154,28 +148,33 @@ public class UnifiedGenotyper extends LocusWalker, GenotypeL * * @param tracker the meta data tracker * @param refContext the reference base - * @param context contextual information around the locus + * @param fullContext contextual information around the locus */ - public Pair, GenotypeLocusData> map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext context) { + public Pair, GenotypeLocusData> map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext fullContext) { char ref = Character.toUpperCase(refContext.getBase()); if ( !BaseUtils.isRegularBase(ref) ) return null; - // because other walkers externally call this map method with their own contexts, - // we need to make sure that the reads are appropriately filtered - if ( calledByAnotherWalker ) - context = filterAlignmentContext(context); + // remove mapping quality zero reads + AlignmentContext MQ0freeContext = filterAlignmentContext(fullContext); // an optimization to speed things up when there is no coverage or when overly covered - if ( context.getReads().size() == 0 || - (UAC.MAX_READS_IN_PILEUP > 0 && context.getReads().size() > UAC.MAX_READS_IN_PILEUP) ) + if ( MQ0freeContext.getReads().size() == 0 || + (UAC.MAX_READS_IN_PILEUP > 0 && MQ0freeContext.getReads().size() > UAC.MAX_READS_IN_PILEUP) ) return null; DiploidGenotypePriors priors = new DiploidGenotypePriors(ref, UAC.heterozygosity, DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE); - return gcm.calculateGenotype(tracker, ref, context, priors); + Pair, GenotypeLocusData> call = gcm.calculateGenotype(tracker, ref, MQ0freeContext, priors); + + // annotate the call, if possible + if ( call != null && call.second != null && call.second instanceof ArbitraryFieldsBacked) { + Map annotations = VariantAnnotator.getAnnotations(refContext, fullContext, call.first); + ((ArbitraryFieldsBacked)call.second).setFields(annotations); + } + + return call; } - // filter the given alignment context private AlignmentContext filterAlignmentContext(AlignmentContext context) { List reads = context.getReads(); List offsets = context.getOffsets(); @@ -185,7 +184,7 @@ public class UnifiedGenotyper extends LocusWalker, GenotypeL for (int i = 0; i < reads.size(); i++) { SAMRecord read = reads.get(i); - if ( read.getMappingQuality() != 0 && read.getReadGroup() != null ) { + if ( read.getMappingQuality() != 0 ) { newReads.add(read); newOffsets.add(offsets.get(i)); } diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index 2aa2c44c0..ccde99503 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -262,5 +262,26 @@ public class MathUtils { return Math.sqrt(rms); } + /** + * normalizes the log10-based array + * @param array the array to be normalized + */ + public static double[] normalizeFromLog10(double[] array) { + double[] normalized = new double[array.length]; + // for precision purposes, we need to add (or really subtract, since they're + // all negative) the largest value; also, we need to convert to normal-space. + double maxValue = Utils.findMaxEntry(array); + for (int i = 0; i < array.length; i++) + normalized[i] = Math.pow(10, array[i] - maxValue); + + // normalize + double sum = 0.0; + for (int i = 0; i < array.length; i++) + sum += normalized[i]; + for (int i = 0; i < array.length; i++) + normalized[i] /= sum; + + return normalized; + } } diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/java/src/org/broadinstitute/sting/utils/Utils.java index efbbed182..c43b57928 100755 --- a/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/java/src/org/broadinstitute/sting/utils/Utils.java @@ -486,6 +486,31 @@ public class Utils { } + // returns the maximum value in the array + public static double findMaxEntry(double[] array) { + return findIndexAndMaxEntry(array).first; + } + + // returns the index of the maximum value in the array + public static int findIndexOfMaxEntry(double[] array) { + return findIndexAndMaxEntry(array).second; + } + + // returns the the maximum value and its index in the array + private static Pair findIndexAndMaxEntry(double[] array) { + if ( array.length == 0 ) + return new Pair(0.0, -1); + int index = 0; + double max = array[0]; + for (int i = 1; i < array.length; i++) { + if ( array[i] > max ) { + max = array[i]; + index = i; + } + } + return new Pair(max, index); + } + /** Returns indices of all occurrences of the specified symbol in the string */ public static int[] indexOfAll(String s, int ch) { int[] pos = new int[64]; diff --git a/java/src/org/broadinstitute/sting/utils/genotype/ReadBacked.java b/java/src/org/broadinstitute/sting/utils/genotype/ReadBacked.java index 2cc95f45f..f515bdf95 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/ReadBacked.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/ReadBacked.java @@ -1,11 +1,9 @@ package org.broadinstitute.sting.utils.genotype; -import net.sf.samtools.SAMRecord; - -import java.util.List; +import org.broadinstitute.sting.utils.ReadBackedPileup; /** - * @author aaron + * @author ebanks * * Interface ReadBacked * @@ -13,16 +11,16 @@ import java.util.List; */ public interface ReadBacked { /** - * get the SAM records that back this genotype call - * @return a list of SAM records + * get the pileup that backs this genotype call + * @return a pileup */ - public List getReads(); + public ReadBackedPileup getPileup(); /** * - * @param reads the reads for this genotype + * @param pileup the pileup for this genotype */ - public void setReads(List reads); + public void setPileup(ReadBackedPileup pileup); /** * get the count of reads diff --git a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java index 23b0c2002..02fc68f5d 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java @@ -109,11 +109,13 @@ public class GeliAdapter implements GenotypeWriter { GeliGenotypeCall gCall = (GeliGenotypeCall)call; char ref = gCall.getReference(); - List recs = gCall.getReads(); - int readCount = recs.size(); + int readCount = gCall.getReadCount(); double maxMappingQual = 0; - for (SAMRecord rec : recs) { - if (maxMappingQual < rec.getMappingQuality()) maxMappingQual = rec.getMappingQuality(); + if ( gCall.getPileup() != null ) { + List recs = gCall.getPileup().getReads(); + for (SAMRecord rec : recs) { + if (maxMappingQual < rec.getMappingQuality()) maxMappingQual = rec.getMappingQuality(); + } } double[] posteriors = gCall.getPosteriors(); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliGenotypeCall.java b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliGenotypeCall.java index b23cf7901..e7966d258 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliGenotypeCall.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliGenotypeCall.java @@ -1,11 +1,8 @@ package org.broadinstitute.sting.utils.genotype.geli; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.*; -import java.util.List; -import java.util.ArrayList; import java.util.Arrays; @@ -20,7 +17,7 @@ public class GeliGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked private final char mRefBase; private final GenomeLoc mLocation; - private List mReads; + private ReadBackedPileup mPileup = null; private double[] mPosteriors; @@ -32,6 +29,8 @@ public class GeliGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked /** * Generate a single sample genotype object * + * @param ref the ref character + * @param loc the genome loc */ public GeliGenotypeCall(char ref, GenomeLoc loc) { mRefBase = ref; @@ -40,7 +39,6 @@ public class GeliGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked // fill in empty data mPosteriors = new double[10]; Arrays.fill(mPosteriors, Double.MIN_VALUE); - mReads = new ArrayList(); } public GeliGenotypeCall(char ref, GenomeLoc loc, String genotype, double negLog10PError) { @@ -66,12 +64,10 @@ public class GeliGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked else mNextGenotype = mRefGenotype; mPosteriors[mNextGenotype.ordinal()] = -1.0 * negLog10PError; - - mReads = new ArrayList(); } - public void setReads(List reads) { - mReads = new ArrayList(reads); + public void setPileup(ReadBackedPileup pileup) { + mPileup = pileup; } public void setPosteriors(double[] posteriors) { @@ -98,7 +94,7 @@ public class GeliGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked public String toString() { lazyEval(); return String.format("%s best=%s cmp=%s ref=%s depth=%d negLog10PError=%.2f", - getLocation(), mBestGenotype, mRefGenotype, mRefBase, mReads.size(), getNegLog10PError()); + getLocation(), mBestGenotype, mRefGenotype, mRefBase, getReadCount(), getNegLog10PError()); } private void lazyEval() { @@ -223,12 +219,12 @@ public class GeliGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked } /** - * get the SAM records that back this genotype call + * get the pileup that backs this genotype call * - * @return a list of SAM records + * @return a pileup */ - public List getReads() { - return mReads; + public ReadBackedPileup getPileup() { + return mPileup; } /** @@ -237,7 +233,7 @@ public class GeliGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked * @return the number of reads we're backed by */ public int getReadCount() { - return mReads.size(); + return (mPileup != null ? mPileup.getReads().size() : 0); } /** diff --git a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java index 6f8faec1b..864471b4b 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java @@ -67,7 +67,7 @@ public class GeliTextWriter implements GenotypeWriter { nextVrsRef = lks[9] - posteriors[DiploidGenotype.createHomGenotype(ref).ordinal()]; double maxMappingQual = 0; - List recs = gCall.getReads(); + List recs = gCall.getPileup().getReads(); int readDepth = recs.size(); for (SAMRecord rec : recs) { if (maxMappingQual < rec.getMappingQuality()) maxMappingQual = rec.getMappingQuality(); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFGenotypeCall.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFGenotypeCall.java index 999e58a05..78cddf92a 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFGenotypeCall.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFGenotypeCall.java @@ -1,17 +1,13 @@ package org.broadinstitute.sting.utils.genotype.glf; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.*; -import java.util.List; -import java.util.ArrayList; import java.util.Arrays; /** - * @author aaron + * @author ebanks *

* Class GLFGenotypeCall *

@@ -21,7 +17,7 @@ public class GLFGenotypeCall implements Genotype, ReadBacked, LikelihoodsBacked private final char mRefBase; private final GenomeLoc mLocation; - private List mReads; + private ReadBackedPileup mPileup; private double[] mLikelihoods; private double mNegLog10PError; @@ -30,6 +26,8 @@ public class GLFGenotypeCall implements Genotype, ReadBacked, LikelihoodsBacked /** * Generate a single sample genotype object, containing everything we need to represent calls out of a genotyper object * + * @param ref the ref character + * @param loc the genome loc */ public GLFGenotypeCall(char ref, GenomeLoc loc) { mRefBase = ref; @@ -39,12 +37,12 @@ public class GLFGenotypeCall implements Genotype, ReadBacked, LikelihoodsBacked mLocation = loc; mLikelihoods = new double[10]; Arrays.fill(mLikelihoods, Double.MIN_VALUE); - mReads = new ArrayList(); + mPileup = null; mNegLog10PError = Double.MIN_VALUE; } - public void setReads(List reads) { - mReads = new ArrayList(reads); + public void setPileup(ReadBackedPileup pileup) { + mPileup = pileup; } public void setLikelihoods(double[] likelihoods) { @@ -68,7 +66,7 @@ public class GLFGenotypeCall implements Genotype, ReadBacked, LikelihoodsBacked public String toString() { return String.format("%s ref=%s depth=%d negLog10PError=%.2f", - getLocation(), mRefBase, mReads.size(), getNegLog10PError()); + getLocation(), mRefBase, getReadCount(), getNegLog10PError()); } /** @@ -155,12 +153,12 @@ public class GLFGenotypeCall implements Genotype, ReadBacked, LikelihoodsBacked } /** - * get the SAM records that back this genotype call + * get the pileup that backs this genotype call * - * @return a list of SAM records + * @return a pileup */ - public List getReads() { - return mReads; + public ReadBackedPileup getPileup() { + return mPileup; } /** @@ -169,7 +167,7 @@ public class GLFGenotypeCall implements Genotype, ReadBacked, LikelihoodsBacked * @return the number of reads we're backed by */ public int getReadCount() { - return mReads.size(); + return (mPileup != null ? mPileup.getReads().size() : 0); } /** diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java index a3543ad97..d87a79dd1 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java @@ -133,7 +133,9 @@ public class GLFWriter implements GenotypeWriter { obj.setLikelihoodType(LikelihoodObject.LIKELIHOOD_TYPE.NEGATIVE_LOG); // transform! ... to negitive log likelihoods // calculate the RMS mapping qualities and the read depth - double rms = calculateRMS(gCall.getReads()); + double rms = 0.0; + if ( gCall.getPileup() != null ) + rms = calculateRMS(gCall.getPileup().getReads()); int readCount = gCall.getReadCount(); this.addGenotypeCall(GenomeLocParser.getContigInfo(gCall.getLocation().getContig()),(int)gCall.getLocation().getStart(),(float)rms,ref,readCount,obj); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeCall.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeCall.java index c36a217ac..beafa999b 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeCall.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeCall.java @@ -1,12 +1,9 @@ package org.broadinstitute.sting.utils.genotype.vcf; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.*; import java.util.Arrays; -import java.util.ArrayList; -import java.util.List; /** @@ -20,7 +17,7 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked, private final char mRefBase; private final GenomeLoc mLocation; - private List mReads; + private ReadBackedPileup mPileup = null; private int mCoverage = 0; private double[] mPosteriors; @@ -41,7 +38,6 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked, mPosteriors = new double[10]; Arrays.fill(mPosteriors, Double.MIN_VALUE); mSampleName = ""; - mReads = new ArrayList(); } public VCFGenotypeCall(char ref, GenomeLoc loc, String genotype, double negLog10PError, int coverage, String sample) { @@ -68,12 +64,10 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked, else mNextGenotype = mRefGenotype; mPosteriors[mNextGenotype.ordinal()] = -1.0 * negLog10PError; - - mReads = new ArrayList(); } - public void setReads(List reads) { - mReads = new ArrayList(reads); + public void setPileup(ReadBackedPileup pileup) { + mPileup = pileup; } public void setPosteriors(double[] posteriors) { @@ -104,7 +98,7 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked, public String toString() { lazyEval(); return String.format("%s best=%s cmp=%s ref=%s depth=%d negLog10PError=%.2f", - getLocation(), mBestGenotype, mRefGenotype, mRefBase, mReads.size(), getNegLog10PError()); + getLocation(), mBestGenotype, mRefGenotype, mRefBase, getReadCount(), getNegLog10PError()); } private void lazyEval() { @@ -229,12 +223,12 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked, } /** - * get the SAM records that back this genotype call + * get the pileup that backs this genotype call * - * @return a list of SAM records + * @return a pileup */ - public List getReads() { - return mReads; + public ReadBackedPileup getPileup() { + return mPileup; } /** @@ -243,7 +237,7 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked, * @return the number of reads we're backed by */ public int getReadCount() { - return (mCoverage > 0 ? mCoverage : mReads.size()); + return (mCoverage > 0 ? mCoverage : (mPileup != null ? mPileup.getReads().size() : 0)); } /** diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java index 487f4624a..eaf24f470 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java @@ -126,14 +126,10 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { Map genotypeMap = genotypeListToSampleNameMap(genotypes); - // keep track of the total read depth - int totalReadDepth = 0; - for (String name : mHeader.getGenotypeSamples()) { if (genotypeMap.containsKey(name)) { Genotype gtype = genotypeMap.get(name); VCFGenotypeRecord record = VCFUtils.createVCFGenotypeRecord(params, (VCFGenotypeCall)gtype); - totalReadDepth += ((VCFGenotypeCall)gtype).getReadCount(); params.addGenotypeRecord(record); genotypeMap.remove(name); } else { @@ -150,7 +146,6 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { // info fields Map infoFields = getInfoFields((VCFGenotypeLocusData)locusdata, params); - infoFields.put("DP", String.format("%d", totalReadDepth)); // q-score double qual = (locusdata == null) ? 0 : ((VCFGenotypeLocusData)locusdata).getConfidence(); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java index 4f4bb7bd3..42ddd23b5 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java @@ -31,8 +31,8 @@ public class VCFRecord { private double mQual; // our filter string private String mFilterString; - // our info fields - private final Map mInfoFields = new HashMap(); + // our info fields -- use a TreeMap to ensure they can be pulled out in order (so it passes integration tests) + private final Map mInfoFields = new TreeMap(); private final String mGenotypeFormatString; @@ -336,6 +336,10 @@ public class VCFRecord { public void addInfoField(String key, String value) { //System.out.printf("Adding info field %s=%s%n", key, value); this.mInfoFields.put(key, value); + + // remove the empty token if it's present + if ( mInfoFields.containsKey(".") ) + mInfoFields.remove("."); } public void printInfoFields() { diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java index aa3083ee2..0ef284ec3 100755 --- a/java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java @@ -91,7 +91,7 @@ public class RodVCFTest extends BaseTest { @Test public void testToString() { // slightly altered line, due to map ordering - final String firstLine = "20\t14370\trs6054257\tG\tA\t29.00\t0\tDP=258;AF=0.786;NS=58\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5\n"; + final String firstLine = "20\t14370\trs6054257\tG\tA\t29.00\t0\tAF=0.786;DP=258;NS=58\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5\n"; RodVCF vcf = getVCFObject(); VCFReader reader = new VCFReader(vcfFile); Iterator iter = vcf.createIterator("VCF", vcfFile); diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java new file mode 100755 index 000000000..94bb7f7b2 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -0,0 +1,44 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.WalkerTest; +import org.junit.Test; + +import java.util.Arrays; + +public class VariantAnnotatorIntegrationTest extends WalkerTest { + public static String baseTestString() { + return "-T VariantAnnotator -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 20:10,000,000-10,010,000 -vcf %s"; + } + + @Test + public void testNoAnnots1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf ", 1, + Arrays.asList("4e231f16c202d88ca3adb17168af0e0f")); + executeTest("testNoAnnots1", spec); + } + + @Test + public void testNoAnnots2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample3.vcf ", 1, + Arrays.asList("ef0c59e47a2afcbecf2bcef6aa01e7e7")); + executeTest("testNoAnnots2", spec); + } + + @Test + public void testAllAnnots1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -all -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf ", 1, + Arrays.asList("ced92b5ac9e2692c4d8acce1235317b6")); + executeTest("testAllAnnots1", spec); + } + + @Test + public void testAllAnnots2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -all -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample3.vcf ", 1, + Arrays.asList("573a6c02f659b2c4cf014f84bd0b9c8a")); + executeTest("testAllAnnots2", spec); + } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java index 480219bcc..30188fc3d 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java @@ -14,7 +14,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest { public void testSimpleVenn() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn", 1, - Arrays.asList("1b8e26cd30e993da9318abd6475f38d0")); + Arrays.asList("2fc12b6f02f4cb589f2fd134e765d6b7")); executeTest("testSimpleVenn", spec); } @@ -22,7 +22,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest { public void testSNPConcordance() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SNPGenotypeConcordance:qscore=5", 1, - Arrays.asList("5a89b8edcdf2e3f469ac354cb1524033")); + Arrays.asList("142bcfcc6bb404cd4bd1a4624fa9a15e")); executeTest("testSNPConcordance", spec); } @@ -30,7 +30,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest { public void testNWayVenn() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -B set3,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/CEU.sample.vcf -CT NWayVenn", 1, - Arrays.asList("1dec083580b75a9c59fcb61426117134")); + Arrays.asList("e067aace9080bafdf3312632de60b4fc")); executeTest("testNWayVenn", spec); } } \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java deleted file mode 100755 index 6eac757f6..000000000 --- a/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ /dev/null @@ -1,80 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.filters; - -import org.broadinstitute.sting.WalkerTest; -import org.junit.Test; - -import java.util.Arrays; - -public class VariantFiltrationIntegrationTest extends WalkerTest { - @Test - public void testIntervals() { - String[] md5DoC = {"d11547059193b90e2fd1eb47dac18999", "21c8e1f9dc65fdfb39347547f9b04011"}; - WalkerTestSpec spec1 = new WalkerTestSpec( - "-T VariantFiltration -X DepthOfCoverage:max=70 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5DoC)); - executeTest("testDoCFilter", spec1); - - String[] md5AlleleBalance = {"409f2b7fb193919d28c20be4dc5c4516", "a13e4ce6260bf9f33ca99dc808b8e6ad"}; - WalkerTestSpec spec2 = new WalkerTestSpec( - "-T VariantFiltration -X AlleleBalance:low=0.25,high=0.75 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5AlleleBalance)); - executeTest("testAlleleBalanceFilter", spec2); - - String[] md5Strand = {"e5806925867088edeee9f6276ab030f3", "0f7db0aad764268ee8fa3b857df8d87d"}; - WalkerTestSpec spec3 = new WalkerTestSpec( - "-T VariantFiltration -X FisherStrand:pvalue=0.0001 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5Strand)); - executeTest("testStrandFilter", spec3); - - String[] md5Lod = {"2c93f56e07e320ac7a012bd4940b1dbc", "7e0c4f2b0fda85fd2891eee76c396a55"}; - WalkerTestSpec spec4 = new WalkerTestSpec( - "-T VariantFiltration -X LodThreshold:lod=10 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5Lod)); - executeTest("testLodFilter", spec4); - - String[] md5MQ0 = {"fde4be5eec12be3405863649170aaaac", "3203de335621851bccf596242b079e23"}; - WalkerTestSpec spec5 = new WalkerTestSpec( - "-T VariantFiltration -X MappingQualityZero:max=70 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5MQ0)); - executeTest("testMappingQuality0Filter", spec5); - - String[] md5MQ = {"17c35a5ccd814991dbcd3be6c2f244cd", "ecc777feedea61f7b570d114c2ab89b1"}; - WalkerTestSpec spec6 = new WalkerTestSpec( - "-T VariantFiltration -X MappingQuality:min=20 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5MQ)); - executeTest("testRMSMappingQualityFilter", spec6); - - String[] md5OnOff = {"fc649a7ebd4a7b7fa739fc4fcb7704e2", "67f2e1bc025833b0fa31f47195198997"}; - WalkerTestSpec spec7 = new WalkerTestSpec( - "-T VariantFiltration -X OnOffGenotypeRatio:threshold=0.9 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5OnOff)); - executeTest("testOnOffGenotypeFilter", spec7); - - String[] md5Clusters = {"9b3aede689a25431ed2e1aac9dd73dc6", "8fa6b6ffc93ee7fb8d6b52a7fb7815ef"}; - WalkerTestSpec spec8 = new WalkerTestSpec( - "-T VariantFiltration -X ClusteredSnps:window=10,snps=3 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5Clusters)); - executeTest("testClusteredSnpsFilter", spec8); - - String[] md5Indels = {"c36cb0021f90b3509e483b57f4f41f6e", "8e0e915a1cb63d7049e0671ed00101fe"}; - WalkerTestSpec spec9 = new WalkerTestSpec( - "-T VariantFiltration -X IndelArtifact -B indels,PointIndel,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.indels -B cleaned,CleanedOutSNP,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.realigner_badsnps -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", - 2, - Arrays.asList(md5Indels)); - executeTest("testIndelArtifactFilter", spec9); - - - - - - - } -} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 24f78d34b..09548fa07 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -7,6 +7,10 @@ import java.util.Arrays; import java.util.HashMap; import java.util.Map; +// ********************************************************************************** // +// Note that this class also serves as an integration test for the VariantAnnotator! // +// ********************************************************************************** // + public class UnifiedGenotyperIntegrationTest extends WalkerTest { public static String baseTestString() { return "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s"; @@ -42,16 +46,16 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testMultiSamplePilot1PointEM() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 50", 1, - Arrays.asList("b7e12c4011d0043024e0dd2dd5764752")); + "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1, + Arrays.asList("b992e55996942c893948ea85660478ba")); executeTest("testMultiSamplePilot1 - Point Estimate EM", spec); } @Test public void testMultiSamplePilot2PointEM() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 50", 1, - Arrays.asList("89c600d72a815c09412c97f82fa2281e")); + "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1, + Arrays.asList("6d28b2af631805dc593508a21ef46a83")); executeTest("testMultiSamplePilot2 - Point Estimate EM", spec); } @@ -63,24 +67,24 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testMultiSamplePilot1Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm JOINT_ESTIMATE -confidence 50", 1, - Arrays.asList("a96862eb5bd3d8db143712f427e3db91")); + "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, + Arrays.asList("90c5129f298075ee0e18233b3763f25d")); executeTest("testMultiSamplePilot1 - Joint Estimate", spec); } @Test public void testMultiSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 50", 1, - Arrays.asList("22735bba0cb6ea3984f1d3913e376ac4")); + "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, + Arrays.asList("033390940ecc0e2dcda5559d6a1802fa")); executeTest("testMultiSamplePilot2 - Joint Estimate", spec); } @Test public void testSingleSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,067,000-10,083,000 -bm empirical -gm JOINT_ESTIMATE -confidence 50", 1, - Arrays.asList("580248cfb813824194bda830427ab3d6")); + "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,067,000-10,083,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, + Arrays.asList("8deb8b1132e7ddf28c7a0d919ce22985")); executeTest("testSingleSamplePilot2 - Joint Estimate", spec); }