From 2235245af09ccd0bcd6ac11ce23dc7a2019569f9 Mon Sep 17 00:00:00 2001 From: chartl Date: Wed, 22 Dec 2010 15:16:15 +0000 Subject: [PATCH] PrivatePermutations generalized to compute transition counts and average probabilities (and thus was renamed). Changes in some pipelines to reflect the change. Bugfix in the batch merging pipeline (it would halt because the allele VCF for genotyping batches could become off-spec). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4894 348d0f76-0448-11de-a6fe-93d51630548a --- .../varianteval/ACTransitionTable.java | 256 ++++++++++++++++++ .../varianteval/PrivatePermutations.java | 165 ----------- scala/qscript/chartl/expanded_targets.q | 4 +- scala/qscript/chartl/private_mutations.q | 6 +- .../queue/pipeline/ProjectManagement.scala | 2 +- 5 files changed, 263 insertions(+), 170 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/ACTransitionTable.java delete mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/PrivatePermutations.java diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/ACTransitionTable.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/ACTransitionTable.java new file mode 100755 index 000000000..03b0d0275 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/ACTransitionTable.java @@ -0,0 +1,256 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.varianteval; + +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; +import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvaluator; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.report.tags.Analysis; +import org.broadinstitute.sting.utils.report.tags.DataPoint; +import org.broadinstitute.sting.utils.report.utils.TableType; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Set; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Nov 22, 2010 + * Time: 12:22:08 PM + * To change this template use File | Settings | File Templates. + */ +@Analysis(name = "ACTransitionMatrix", description = "Number of additional genotypes from each new sample; random permutations") +public class ACTransitionTable extends VariantEvaluator { + private final int NUM_PERMUTATIONS = 50; + private final double LOW_GQ_PCT = 0.95; + private final double LOW_GQ_THRSH = 30.0; + private boolean initialized = false; + private long skipped = 0l; + + @DataPoint(name="Het transitions",description="AC[s] = AC[s-1]+1 and AC[s] = AC[s-1]+2 transitions") + TransitionTable transitions = null; + @DataPoint(name="Private permutations",description="Marginal increase in number of sites per sample") + PermutationCounts privatePermutations = new PermutationCounts(1,transitions); + @DataPoint(name="AC2 Permutations",description="Marginal increase in number of AC=2 sites, per sample") + PermutationCounts doubletonPermutations = new PermutationCounts(2,transitions); + @DataPoint(name="AC3 Permutations",description="Marginal increase in number of tripleton sites, per sample") + PermutationCounts tripletonPermutations = new PermutationCounts(3,transitions); + + String[][] permutations; + + public boolean enabled() { + return true; + } + + public int getComparisonOrder() { + return 2; + } + + public String getName() { + return "ACTransitionTable"; + } + + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( eval != null && ! initialized ) { + this.veWalker.getLogger().warn("Initializing..."); + initialize(eval); + initialized = true; + } + + if ( isGood(eval) ) { + if ( comp != null && ! comp.isFiltered() ) { + return null; + } + + int order_offset = 0; + for ( String[] ordering : permutations ) { + int sample_offset = 0; + int variant_ac = 0; + for ( String sample : ordering ) { + if ( eval.getGenotype(sample).isHet() ) { + variant_ac++; + transitions.hetTransitionCounts[order_offset][sample_offset][variant_ac-1]++; + } else if ( eval.getGenotype(sample).isHomVar() ) { + variant_ac += 2; + transitions.homTransitionCounts[order_offset][sample_offset][variant_ac-1]++; + } else { + // todo -- note, unclear how to treat no calls. Is the hom in het,ref,ref,nocall,hom sample 4 or 5? + transitions.stationaryCounts[order_offset][sample_offset][variant_ac-1]++; + } + sample_offset ++; + } + order_offset++; + } + } else { + skipped++; + } + + return null; + } + + private boolean isGood(VariantContext vc) { + if ( vc == null || vc.isFiltered() || (vc.getHetCount() + vc.getHomVarCount() == 0) ) { // todo -- should be is variant, but need to ensure no alt alleles at ref sites + return false; + } else { + Collection gtypes = vc.getGenotypes().values(); + int ngood = 0; + for ( Genotype g : gtypes) { + if ( g.isCalled() && g.getPhredScaledQual() >= LOW_GQ_THRSH ) { + ngood ++; + } + } + + return ( (0.0+ngood)/(0.0+gtypes.size()) >= LOW_GQ_PCT ); + } + } + + public ACTransitionTable(VariantEvalWalker parent) { + super(parent); + } + + public void initialize(VariantContext vc) { + Set permuteSamples = vc.getSampleNames(); + permutations = new String[NUM_PERMUTATIONS][permuteSamples.size()]; + veWalker.getLogger().warn(String.format("Num samples: %d",permuteSamples.size())); + int offset = 0; + for ( String s : permuteSamples ) { + permutations[0][offset] = s; + offset ++; + } + + for ( int p = 1; p < NUM_PERMUTATIONS ; p++ ) { + permutations[p] = permutations[0].clone(); + for ( int o = 0; o < permutations[p].length; o ++ ) { + int r = (int) Math.floor(Math.random()*(o+1)); + String swap = permutations[p][r]; + permutations[p][r] = permutations[p][o]; + permutations[p][o] = swap; + } + } + + transitions = new TransitionTable(); + transitions.hetTransitionCounts = new int[NUM_PERMUTATIONS][permuteSamples.size()][2*permuteSamples.size()]; + transitions.homTransitionCounts = new int[NUM_PERMUTATIONS][permuteSamples.size()][2*permuteSamples.size()]; + transitions.stationaryCounts = new int[NUM_PERMUTATIONS][permuteSamples.size()][2*permuteSamples.size()]; + } + + public void finalizeEvaluation() { + veWalker.getLogger().info(String.format("Skipped: %d",skipped)); + } + + class TransitionTable implements TableType { + int[][][] hetTransitionCounts; + int[][][] homTransitionCounts; + int[][][] stationaryCounts; + String[][] countAverages; + String[] rowKeys = null; + String[] colKeys = null; + + public Object[] getRowKeys() { + if ( rowKeys == null ) { + rowKeys = new String[3*hetTransitionCounts[0].length]; + for ( int i = 0; i < hetTransitionCounts[0].length; i ++ ) { + rowKeys[i] = String.format("%s%d%s","Sample_",i,"_(het)"); + } + for ( int i = 0; i < hetTransitionCounts[0].length; i ++ ) { + rowKeys[i] = String.format("%s%d%s","Sample_",i,"_(hom)"); + } + for ( int i = 0; i < hetTransitionCounts[0].length; i ++ ) { + rowKeys[i] = String.format("%s%d%s","Sample_",i,"_(ref)"); + } + } + + + return rowKeys; + } + + public String getCell(int x, int y) { + if ( countAverages == null ) { + countAverages = new String[hetTransitionCounts[0].length][hetTransitionCounts[0][0].length]; + for ( int idx = 0; idx < hetTransitionCounts[0][0].length; idx ++) { + for ( int sam = 0 ; sam < hetTransitionCounts[0].length; sam ++ ) { + int totalTimesAtACSample = 0; + int totalStationary = 0; + int totalAC1Shift = 0; + int totalAC2Shift = 0; + for ( int p = 0; p < hetTransitionCounts.length; p++ ) { + totalStationary += stationaryCounts[p][sam][idx]; + totalAC2Shift += (idx+2 > hetTransitionCounts[0][0].length) ? 0 : homTransitionCounts[p][sam][idx+2]; + totalAC1Shift += (idx+1 > hetTransitionCounts[0][0].length) ? 0 : hetTransitionCounts[p][sam][idx+1]; + } + totalTimesAtACSample = totalStationary+totalAC1Shift+totalAC2Shift; + countAverages[sam][idx] = String.format("%.4f", ((double) totalAC1Shift)/totalTimesAtACSample); + countAverages[sam][hetTransitionCounts[0][0].length+idx] = String.format("%.4f", ((double) totalAC2Shift)/totalTimesAtACSample); + countAverages[sam][hetTransitionCounts[0][0].length*2+idx] = String.format("%.4f",((double)totalStationary)/totalTimesAtACSample); + } + } + } + + return countAverages[x][y]; + } + + public String getName() { return "AC Transition Tables"; } + + public Object[] getColumnKeys() { + if ( colKeys == null ) { + colKeys = new String[hetTransitionCounts[0][0].length]; + for ( int ac = 0; ac < hetTransitionCounts[0][0].length; ac ++ ) { + colKeys[ac] = String.format("AC%d",ac); + } + } + + return colKeys; + } + } + + + class PermutationCounts implements TableType { + int acToExtract; + TransitionTable table; + String[] rowNames; + String[] colNames; + + public PermutationCounts(int ac, TransitionTable tTable) { + acToExtract = ac; + table = tTable; + } + + public String[] getRowKeys() { + if ( rowNames == null ) { + rowNames = new String[table.stationaryCounts.length]; + for ( int p = 0 ; p < rowNames.length; p ++ ) { + rowNames[p] = String.format("Perm%d",p+1); + } + } + + return rowNames; + } + + public String[] getColumnKeys() { + if ( colNames == null ) { + colNames = new String[table.stationaryCounts[0].length]; + for ( int s = 0 ; s < colNames.length; s ++ ) { + colNames[s] = String.format("Sample%d",s+1); + } + } + + return colNames; + } + + public Integer getCell(int x, int y) { + return table.hetTransitionCounts[x][y][acToExtract-1] + + ( (acToExtract > table.homTransitionCounts[0][0].length) ? 0 : table.homTransitionCounts[x][y][acToExtract-1]); + } + + public String getName() { + return String.format("PermutationCountsAC%d",acToExtract); + } + } + + +} + diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/PrivatePermutations.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/PrivatePermutations.java deleted file mode 100755 index f266a417a..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/PrivatePermutations.java +++ /dev/null @@ -1,165 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.Genotype; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvaluator; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import org.broadinstitute.sting.utils.report.tags.DataPoint; -import org.broadinstitute.sting.utils.report.utils.TableType; - -import java.util.Arrays; -import java.util.Collection; -import java.util.Set; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Nov 22, 2010 - * Time: 12:22:08 PM - * To change this template use File | Settings | File Templates. - */ -@Analysis(name = "PrivatePermutations", description = "Number of additional mutations from each new sample; random permutations") -public class PrivatePermutations extends VariantEvaluator { - private final int NUM_PERMUTATIONS = 50; - private final double LOW_GQ_PCT = 0.95; - private final double LOW_GQ_THRSH = 30.0; - private boolean initialized = false; - private long skipped = 0l; - - @DataPoint(name="Marginal Number of Mutations",description="Number of additional mutations from each new sample; random permutations") - AdditionalBySample permuteCounts = null; - - String[][] permutations; - - public boolean enabled() { - return true; - } - - public int getComparisonOrder() { - return 2; - } - - public String getName() { - return "PrivatePermutations"; - } - - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( eval != null && ! initialized ) { - this.veWalker.getLogger().warn("Initializing..."); - initialize(eval); - initialized = true; - } - - if ( isGood(eval) ) { - if ( comp != null && ! comp.isFiltered() ) { - return null; - } - - int order_offset = 0; - for ( String[] ordering : permutations ) { - int sample_offset = 0; - for ( String sample : ordering ) { - if ( eval.getGenotype(sample).isHet() || eval.getGenotype(sample).isHomVar() ) { - break; - } - sample_offset ++; - } - - permuteCounts.additionalValue[order_offset][sample_offset]++; - order_offset++; - } - } else { - skipped++; - } - - return null; - } - - private boolean isGood(VariantContext vc) { - if ( vc == null || vc.isFiltered() || (vc.getHetCount() + vc.getHomVarCount() == 0) ) { // todo -- should be is variant, but need to ensure no alt alleles at ref sites - return false; - } else { - Collection gtypes = vc.getGenotypes().values(); - int ngood = 0; - for ( Genotype g : gtypes) { - if ( g.getPhredScaledQual() >= LOW_GQ_THRSH ) { - ngood ++; - } - } - - return ( (0.0+ngood)/(0.0+gtypes.size()) >= LOW_GQ_PCT ); - } - } - - public PrivatePermutations(VariantEvalWalker parent) { - super(parent); - } - - public void initialize(VariantContext vc) { - Set permuteSamples = vc.getSampleNames(); - permutations = new String[NUM_PERMUTATIONS][permuteSamples.size()]; - veWalker.getLogger().warn(String.format("Num samples: %d",permuteSamples.size())); - int offset = 0; - for ( String s : permuteSamples ) { - permutations[0][offset] = s; - offset ++; - } - - for ( int p = 1; p < NUM_PERMUTATIONS ; p++ ) { - permutations[p] = permutations[0].clone(); - for ( int o = 0; o < permutations[p].length; o ++ ) { - int r = (int) Math.floor(Math.random()*(o+1)); - String swap = permutations[p][r]; - permutations[p][r] = permutations[p][o]; - permutations[p][o] = swap; - } - } - - permuteCounts = new AdditionalBySample(); - permuteCounts.additionalValue = new int[NUM_PERMUTATIONS][permuteSamples.size()]; - } - - class AdditionalBySample implements TableType { - int[][] additionalValue; - //String[][] permutationNames; - String[] rowKeys = null; - String[] colKeys = null; - - public Object[] getRowKeys() { - if ( rowKeys == null ) { - rowKeys = new String[additionalValue.length]; - for ( int i = 0; i < additionalValue.length; i ++ ) { - rowKeys[i] = String.format("%s%d","P",i); - } - } - - - return rowKeys; - } - - public String getCell(int x, int y) { - return String.format("%d",additionalValue[x][y]); - } - - public String getName() { return "Marginal Number of Mutations"; } - - public Object[] getColumnKeys() { - if ( colKeys == null ) { - colKeys = new String[additionalValue[0].length]; - for ( int i = 0; i < additionalValue[0].length; i ++ ) { - colKeys[i] = String.format("%s%d","S",i); - } - } - - return colKeys; - } - } - - public void finalizeEvaluation() { - veWalker.getLogger().info(String.format("Skipped: %d",skipped)); - } -} diff --git a/scala/qscript/chartl/expanded_targets.q b/scala/qscript/chartl/expanded_targets.q index 54d027580..4c838963b 100755 --- a/scala/qscript/chartl/expanded_targets.q +++ b/scala/qscript/chartl/expanded_targets.q @@ -34,9 +34,10 @@ class expanded_targets extends QScript { val uncleanBams : List[File] = asScalaIterable(new XReadLines(args.projectBams)).toList.map(u => new File(u)) val realign : List[RealignerTargetCreator] = uncleanBams.map(u => { var rtc : RealignerTargetCreator = new RealignerTargetCreator with GATKArgs - rtc.out = swapExt(userDir,u,".bam",".expanded.targets.txt") + rtc.out = swapExt(userDir,u,".bam",".clean.targets.interval_list") rtc.input_file :+= u.getAbsoluteFile rtc.intervals :+= cleanIntervals.outList + rtc.memoryLimit = Some(4) rtc }) val clean : List[IndelRealigner] = realign.map( u => { @@ -45,6 +46,7 @@ class expanded_targets extends QScript { cleaner.input_file = u.input_file cleaner.memoryLimit = Some(4) cleaner.out = new File(userDir+"/"+swapExt(u.out,".bam",".expanded.targets.bam").getName) + cleaner.intervals :+= cleanIntervals.outList cleaner }) diff --git a/scala/qscript/chartl/private_mutations.q b/scala/qscript/chartl/private_mutations.q index ea4edadd8..eb27230ac 100755 --- a/scala/qscript/chartl/private_mutations.q +++ b/scala/qscript/chartl/private_mutations.q @@ -59,7 +59,7 @@ class private_mutations extends QScript { var eval_all : VariantEval = vcLib.addTrait(new VariantEval) eval_all.rodBind :+= new RodBind("evalEOMI","vcf",finalMergedVCF) eval_all.noStandard = true - eval_all.E :+= "PrivatePermutations" + eval_all.E :+= "ACTransitionTable" eval_all.out = swapExt(finalMergedVCF,".vcf",".perm.csv") eval_all.reportType = Some(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.CSV) @@ -68,7 +68,7 @@ class private_mutations extends QScript { var eval_afr : VariantEval = vcLib.addTrait(new VariantEval) eval_afr.rodBind :+= new RodBind("evalAFR","VCF",extract_afr.outputVCF) eval_afr.rodBind :+= new RodBind("compEUR","VCF",extract_eur.outputVCF) - eval_afr.E :+= "PrivatePermutations" + eval_afr.E :+= "ACTransitionTable" eval_afr.out = swapExt(extract_afr.outputVCF,".vcf",".perm.csv") eval_afr.reportType = Some(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.CSV) eval_afr.noStandard = true @@ -78,7 +78,7 @@ class private_mutations extends QScript { var eval_eur : VariantEval = vcLib.addTrait(new VariantEval) eval_eur.rodBind :+= new RodBind("compAFR","VCF",extract_afr.outputVCF) eval_eur.rodBind :+= new RodBind("evalEUR","VCF",extract_eur.outputVCF) - eval_eur.E :+= "PrivatePermutations" + eval_eur.E :+= "ACTransitionTable" eval_eur.out = swapExt(extract_eur.outputVCF,".vcf",".perm.csv") eval_eur.reportType = Some(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.CSV) eval_eur.noStandard = true diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/ProjectManagement.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/ProjectManagement.scala index a1ed558e4..3e6379149 100755 --- a/scala/src/org/broadinstitute/sting/queue/pipeline/ProjectManagement.scala +++ b/scala/src/org/broadinstitute/sting/queue/pipeline/ProjectManagement.scala @@ -25,7 +25,7 @@ class ProjectManagement(stingPath: String) { @Argument(doc="Path to the reference file on disk") var ref: File = _ def commandLine = { - "egrep \"FORMAT|format\" %s | cut -f1-8 > %s ; grep PASS %s | tr ':' '\\t' | awk '{print $1\"\\t\"$2\"\\t\"$3\"\\t\"$4\"\\t\"$5\"\\t\"$6\"\\t.\\t.\\t.\"}' | sort -n -k2,2 | uniq | perl %s - %s.fai >> %s".format( + "egrep \"FORMAT|format\" %s | cut -f1-8 > %s ; grep PASS %s | tr ':' '\\t' | awk '{print $2\"\\t\"$3\"\\t\"$4\"\\t\"$5\"\\t\"$6\"\\t.\\t.\\t.\"}' | sort -n -k2,2 | uniq | perl %s - %s.fai >> %s".format( vcf_files(0).getAbsolutePath, out_vcf.getAbsolutePath, vcf_files.foldLeft[String]("")( (b,a) => b + " " + a.getAbsolutePath), sortByRef, ref.getAbsolutePath, out_vcf.getAbsolutePath ) }