From a1d33f8791a522ae58e45e7cece5128ecb2a7e8d Mon Sep 17 00:00:00 2001 From: ebanks Date: Tue, 14 Jul 2009 01:56:50 +0000 Subject: [PATCH] -Added walker to dump strand test results to file -Refactored strand filter to handle calls from the walker git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1229 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/variants/StrandTestPrinter.java | 135 ++++++++++++++++++ .../walkers/variants/VECFisherStrand.java | 104 ++++++++------ 2 files changed, 195 insertions(+), 44 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/StrandTestPrinter.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/StrandTestPrinter.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/StrandTestPrinter.java new file mode 100755 index 000000000..be3a15698 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/StrandTestPrinter.java @@ -0,0 +1,135 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variants; + +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.io.*; + +@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData={@RMD(name="hapmap",type=HapMapGenotypeROD.class),@RMD(name="dbsnp",type=rodDbSNP.class),@RMD(name="variant",type=rodVariants.class)}) +public class StrandTestPrinter extends LocusWalker { + @Argument(fullName="concordance_out", shortName="CO", doc="File to which concordant variant stats should be written", required=true) + File CONCORDANCE_OUT = null; + @Argument(fullName="sample_name", shortName="sample", doc="Sample name (e.g. NA12878)", required=true) + String SAMPLE_NAME = null; + + private PrintWriter writer = null; + + /** + * Prepare the output file and the list of available features. + */ + public void initialize() { + try { + writer = new PrintWriter(CONCORDANCE_OUT); + } catch (FileNotFoundException e) { + throw new StingException(String.format("Could not open file for writing")); + } + writer.println("location\tvariant\tlodBestVsRef\tStrandScore\tForwardCount\tReverseCount\tInHapmap\tIndbSNP\tOurCall\tHapMapCall"); + } + + /** + * Initialize the number of loci processed to zero. + * + * @return 0 + */ + public Integer reduceInit() { return 0; } + + /** + * For each site of interest, rescore the genotype likelihoods by applying the specified feature set. + * + * @param tracker the meta-data tracker + * @param ref the reference base + * @param context the context for the given locus + * @return 1 if the locus is a het site, 0 if otherwise + */ + public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { + HapMapGenotypeROD hapmap = (HapMapGenotypeROD)tracker.lookup("hapmap", null); + rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbsnp", null); + rodVariants variant = (rodVariants)tracker.lookup("variant", null); + + StringBuffer sb = new StringBuffer(); + char refUpper = Character.toUpperCase(ref); + + boolean realHetVariant = false, realVariant = false, calledHetVariant = false; + if ( hapmap != null ) { + String genotype = hapmap.get(SAMPLE_NAME).toUpperCase(); + if ( genotype.charAt(0) != refUpper || genotype.charAt(1) != refUpper ) { + realVariant = true; + if ( genotype.charAt(0) != genotype.charAt(1) ) { + realHetVariant = true; + sb.append(context.getLocation() + "\t" + genotype + "\t"); + } + } + } + if ( variant != null ) { + String genotype = variant.getBestGenotype().toUpperCase(); + if ( (genotype.charAt(0) != refUpper || genotype.charAt(1) != refUpper) && + (genotype.charAt(0) != genotype.charAt(1)) ) { + calledHetVariant = true; + if ( !realHetVariant ) + sb.append(context.getLocation() + "\t" + genotype + "\t"); + } + } + + if ( !realHetVariant && !calledHetVariant ) + return 0; + + if ( variant != null ) + sb.append(variant.getLodBtr() + "\t"); + else + sb.append("-1\t"); + + int allele1, allele2; + if ( hapmap != null ) { + allele1 = BaseUtils.simpleBaseToBaseIndex(hapmap.get(SAMPLE_NAME).charAt(0)); + allele2 = BaseUtils.simpleBaseToBaseIndex(hapmap.get(SAMPLE_NAME).charAt(1)); + } else { + allele1 = BaseUtils.simpleBaseToBaseIndex(variant.getBestGenotype().charAt(0)); + allele2 = BaseUtils.simpleBaseToBaseIndex(variant.getBestGenotype().charAt(1)); + } + VECFisherStrand.strandTest(ref, context, allele1, allele2, -1, sb); + + sb.append((hapmap != null ? "1" : "0") + "\t" + (dbsnp != null ? "1" : "0") + "\t"); + if ( variant == null) + sb.append("REF\t"); + else if ( calledHetVariant ) + sb.append("HET\t"); + else + sb.append("HOMNONREF\t"); + + if ( realHetVariant ) + sb.append("HET\n"); + else if ( realVariant ) + sb.append("HOMNONREF\n"); + else + sb.append("REF\n"); + + writer.print(sb.toString()); + + return 0; + } + + /** + * Increment the number of loci processed. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return the new number of loci processed. + */ + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + /** + * Tell the user the number of hets processed and close out the files. + * + * @param result the number of variants seen. + */ + public void onTraversalDone(Integer result) { + out.printf("Processed %d hets.\n", result); + + writer.close(); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECFisherStrand.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECFisherStrand.java index cd60c942a..8eb53b5bc 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECFisherStrand.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECFisherStrand.java @@ -3,7 +3,6 @@ package org.broadinstitute.sting.playground.gatk.walkers.variants; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.refdata.rodVariants; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.ReadBackedPileup; import net.sf.samtools.SAMRecord; import java.util.List; @@ -11,8 +10,13 @@ import java.util.List; import cern.jet.math.Arithmetic; public class VECFisherStrand implements VariantExclusionCriterion { + + private double pvalueLimit = 0.0001; + public void initialize(String arguments) { - //if (arguments != null && !arguments.isEmpty()) {} + if (arguments != null && !arguments.isEmpty()) { + pvalueLimit = Double.valueOf(arguments); + } } public boolean exclude(char ref, LocusContext context, rodVariants variant) { @@ -20,48 +24,62 @@ public class VECFisherStrand implements VariantExclusionCriterion { int allele2 = BaseUtils.simpleBaseToBaseIndex(variant.getBestGenotype().charAt(1)); if (allele1 != allele2) { - int[][] table = getContingencyTable(context, variant); - - double pCutoff = computePValue(table); - //printTable(table, pCutoff); - - double pValue = 0.0; - while (rotateTable(table)) { - double pValuePiece = computePValue(table); - - //printTable(table, pValuePiece); - - if (pValuePiece <= pCutoff) { - pValue += pValuePiece; - } - } - - table = getContingencyTable(context, variant); - - while (unrotateTable(table)) { - double pValuePiece = computePValue(table); - - //printTable(table, pValuePiece); - - if (pValuePiece <= pCutoff) { - pValue += pValuePiece; - } - } - - //System.out.printf("P-cutoff: %f\n", pCutoff); - //System.out.printf("P-value: %f\n\n", pValue); - - return pValue < 0.05; + return strandTest(ref, context, allele1, allele2, pvalueLimit, null); } return false; } + public static boolean strandTest(char ref, LocusContext context, int allele1, int allele2, double threshold, StringBuffer out) { + int[][] table = getContingencyTable(context, allele1, allele2); + + double pCutoff = computePValue(table); + //printTable(table, pCutoff); + + double pValue = 0.0; + while (rotateTable(table)) { + double pValuePiece = computePValue(table); + + //printTable(table, pValuePiece); + + if (pValuePiece <= pCutoff) { + pValue += pValuePiece; + } + } + + table = getContingencyTable(context, allele1, allele2); + + while (unrotateTable(table)) { + double pValuePiece = computePValue(table); + + //printTable(table, pValuePiece); + + if (pValuePiece <= pCutoff) { + pValue += pValuePiece; + } + } + + //System.out.printf("P-cutoff: %f\n", pCutoff); + //System.out.printf("P-value: %f\n\n", pValue); + + // optionally print out the pvalue and the alternate allele counts + if ( out != null ) { + int refBase = BaseUtils.simpleBaseToBaseIndex(ref); + table = getContingencyTable(context, allele1, allele2); + if ( allele1 != refBase ) + out.append(pValue + "\t" + table[0][0] + "\t" + table[0][1] + "\t"); + else + out.append(pValue + "\t" + table[1][0] + "\t" + table[1][1] + "\t"); + } + + return pValue < threshold; + } + private void printTable(int[][] table, double pValue) { System.out.printf("%d %d; %d %d : %f\n", table[0][0], table[0][1], table[1][0], table[1][1], pValue); } - private boolean rotateTable(int[][] table) { + private static boolean rotateTable(int[][] table) { table[0][0] -= 1; table[1][0] += 1; @@ -71,7 +89,7 @@ public class VECFisherStrand implements VariantExclusionCriterion { return (table[0][0] >= 0 && table[1][1] >= 0) ? true : false; } - private boolean unrotateTable(int[][] table) { + private static boolean unrotateTable(int[][] table) { table[0][0] += 1; table[1][0] -= 1; @@ -81,7 +99,7 @@ public class VECFisherStrand implements VariantExclusionCriterion { return (table[0][1] >= 0 && table[1][0] >= 0) ? true : false; } - private double computePValue(int[][] table) { + private static double computePValue(int[][] table) { double pCutoff = 1.0; int[] rowSums = { sumRow(table, 0), sumRow(table, 1) }; @@ -102,7 +120,7 @@ public class VECFisherStrand implements VariantExclusionCriterion { return pCutoff; } - private int sumRow(int[][] table, int column) { + private static int sumRow(int[][] table, int column) { int sum = 0; for (int r = 0; r < table.length; r++) { sum += table[r][column]; @@ -111,7 +129,7 @@ public class VECFisherStrand implements VariantExclusionCriterion { return sum; } - private int sumColumn(int[][] table, int row) { + private static int sumColumn(int[][] table, int row) { int sum = 0; for (int c = 0; c < table[row].length; c++) { sum += table[row][c]; @@ -126,16 +144,14 @@ public class VECFisherStrand implements VariantExclusionCriterion { * allele1 # # * allele2 # # * @param context the context for the locus - * @param variant information for the called variant + * @param allele1 information for the called variant + * @param allele2 information for the called variant * @return a 2x2 contingency table */ - private int[][] getContingencyTable(LocusContext context, rodVariants variant) { + private static int[][] getContingencyTable(LocusContext context, int allele1, int allele2) { int[][] table = new int[2][2]; - int allele1 = BaseUtils.simpleBaseToBaseIndex(variant.getBestGenotype().charAt(0)); - int allele2 = BaseUtils.simpleBaseToBaseIndex(variant.getBestGenotype().charAt(1)); - List reads = context.getReads(); List offsets = context.getOffsets();