Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

2012-02-03 13:45:11 -05:00 · 2012-02-03 13:45:11 -05:00 · 0e44430e47
parent 601e53d633 aa3638ecb3
commit 0e44430e47
4 changed files with 171 additions and 29 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java
@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.samples.Sample;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation;
 import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
 import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
@ -17,16 +18,13 @@ import java.util.*;

 /**
 * Created by IntelliJ IDEA.
- * User: rpoplin, lfran
+ * User: rpoplin, lfran, ebanks
 * Date: 11/14/11
 */

-public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation {
+public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation {

    private Set<Sample> trios = null;
-    private final static int REF = 0;
-    private final static int HET = 1;
-    private final static int HOM = 2;
    private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information

    public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
@ -38,10 +36,10 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen
            }
        }

-        final Map<String,Object> toRet = new HashMap<String,Object>(1);
+        final Map<String, Object> toRet = new HashMap<String, Object>(1);
        final HashSet<Sample> triosToTest = new HashSet<Sample>();

-        for( final Sample child : trios) {
+        for( final Sample child : trios ) {
            final boolean hasAppropriateGenotypes = vc.hasGenotype(child.getID()) && vc.getGenotype(child.getID()).hasLikelihoods() &&
                    vc.hasGenotype(child.getPaternalID()) && vc.getGenotype(child.getPaternalID()).hasLikelihoods() &&
                    vc.hasGenotype(child.getMaternalID()) && vc.getGenotype(child.getMaternalID()).hasLikelihoods();
@ -65,28 +63,54 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen
    // Following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT
    private double calculateTDT( final VariantContext vc, final Set<Sample> triosToTest ) {

-        final double nABGivenABandBB = calculateNChildren(vc, triosToTest, HET, HET, HOM) + calculateNChildren(vc, triosToTest, HET, HOM, HET);
-        final double nBBGivenABandBB = calculateNChildren(vc, triosToTest, HOM, HET, HOM) + calculateNChildren(vc, triosToTest, HOM, HOM, HET);
-        final double nAAGivenABandAB = calculateNChildren(vc, triosToTest, REF, HET, HET);
-        final double nBBGivenABandAB = calculateNChildren(vc, triosToTest, HOM, HET, HET);
-        final double nAAGivenAAandAB = calculateNChildren(vc, triosToTest, REF, REF, HET) + calculateNChildren(vc, triosToTest, REF, HET, REF);
-        final double nABGivenAAandAB = calculateNChildren(vc, triosToTest, HET, REF, HET) + calculateNChildren(vc, triosToTest, HET, HET, REF);
+        double nABGivenABandBB = 0.0;
+        double nBBGivenABandBB = 0.0;
+        double nAAGivenABandAB = 0.0;
+        double nBBGivenABandAB = 0.0;
+        double nAAGivenAAandAB = 0.0;
+        double nABGivenAAandAB = 0.0;
+
+        // for each pair of alleles, add the likelihoods
+        int numAlleles = vc.getNAlleles();
+        for ( int allele1 = 0; allele1 < numAlleles; allele1++ ) {
+            for ( int allele2 = allele1 + 1; allele2 < numAlleles; allele2++ ) {
+
+                // TODO -- cache these for better performance
+                final int HOM1index = determineHomIndex(allele1, numAlleles);
+                final int HETindex = HOM1index + (allele2 - allele1);
+                final int HOM2index = determineHomIndex(allele2, numAlleles);
+
+                nABGivenABandBB += calculateNChildren(vc, triosToTest, HETindex, HETindex, HOM2index) + calculateNChildren(vc, triosToTest, HETindex, HOM2index, HETindex);
+                nBBGivenABandBB += calculateNChildren(vc, triosToTest, HOM2index, HETindex, HOM2index) + calculateNChildren(vc, triosToTest, HOM2index, HOM2index, HETindex);
+                nAAGivenABandAB += calculateNChildren(vc, triosToTest, HOM1index, HETindex, HETindex);
+                nBBGivenABandAB += calculateNChildren(vc, triosToTest, HOM2index, HETindex, HETindex);
+                nAAGivenAAandAB += calculateNChildren(vc, triosToTest, HOM1index, HOM1index, HETindex) + calculateNChildren(vc, triosToTest, HOM1index, HETindex, HOM1index);
+                nABGivenAAandAB += calculateNChildren(vc, triosToTest, HETindex, HOM1index, HETindex) + calculateNChildren(vc, triosToTest, HETindex, HETindex, HOM1index);
+            }
+        }

        final double numer = (nABGivenABandBB - nBBGivenABandBB) + 2.0 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB);
        final double denom = (nABGivenABandBB + nBBGivenABandBB) + 4.0 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB);
        return (numer * numer) / denom;
    }

-    private double calculateNChildren( final VariantContext vc, final Set<Sample> triosToTest, final int childIdx, final int parent1Idx, final int parent2Idx ) {
+    private double calculateNChildren( final VariantContext vc, final Set<Sample> triosToTest, final int childIdx, final int momIdx, final int dadIdx ) {
        final double likelihoodVector[] = new double[triosToTest.size()];
        int iii = 0;
        for( final Sample child : triosToTest ) {
            final double[] momGL = vc.getGenotype(child.getMaternalID()).getLikelihoods().getAsVector();
            final double[] dadGL = vc.getGenotype(child.getPaternalID()).getLikelihoods().getAsVector();
            final double[] childGL = vc.getGenotype(child.getID()).getLikelihoods().getAsVector();
-            likelihoodVector[iii++] = momGL[parent1Idx] + dadGL[parent2Idx] + childGL[childIdx];
+            likelihoodVector[iii++] = momGL[momIdx] + dadGL[dadIdx] + childGL[childIdx];
        }

        return MathUtils.sumLog10(likelihoodVector);
    }
+    
+    private static int determineHomIndex(final int alleleIndex, int numAlleles) {
+        int result = 0;
+        for ( int i = 0; i < alleleIndex; i++ )
+            result += numAlleles--;
+        return result;
+    }
 }
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java
@ -114,7 +114,7 @@ public class VariantsToPed extends RodWalker<Integer,Integer> {
                String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID);
                String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3";
                String pheno = mVals.get("phenotype");
-                outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,pid,sample,mid,sex,pheno);
+                outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno);
            }
        }
    }
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala
@ -0,0 +1,88 @@
+package org.broadinstitute.sting.queue.qscripts.lib
+
+import org.broadinstitute.sting.queue.QScript
+import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals
+import scala.collection.JavaConversions._
+import org.broadinstitute.sting.utils.text.XReadLines
+import java.io.PrintStream
+import org.broadinstitute.sting.queue.extensions.gatk.SelectVariants
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: chartl
+ * Date: 2/2/12
+ * Time: 12:13 PM
+ * To change this template use File | Settings | File Templates.
+ */
+
+class ChunkVCF extends QScript {
+
+  @Input(shortName="V",fullName="VCF",doc="The VCF you want to chunk",required=true)
+  var inVCF : File = _
+
+  @Input(shortName="N",fullName="numEntriesInChunk",doc="The number of variants per chunk",required=true)
+  var numEntries : Int = _
+
+  @Input(shortName="I",fullName="Intervals",doc="The SNP interval list to chunk. If not provided, one will be created for you to provide in a second run.")
+  var intervals : File = _
+
+  @Input(fullName="preserveChromosomes",doc="Restrict chunks to one chromosome (smaller chunk at end of chromosome)",required=false)
+  var preserve : Boolean = false
+
+  @Input(fullName="reference",doc="The reference file",required=false)
+  var ref : File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
+
+  @Input(fullName="samples",doc="A file of sample IDs to condense VCF file to",required=false)
+  var extractSamples : File = _
+
+  val tmpdir : File = System.getProperty("java.io.tmpdir")
+
+  def script = {
+    if ( intervals == null ) {
+      // create an interval list from the VCF
+      val ivals : File = swapExt(variants,".vcf",".intervals.list")
+      val extract : VCFExtractIntervals = new VCFExtractIntervals(variants,ivals,false)
+      add(extract)
+    } else {
+      var chunkNum = 1
+      var numLinesInChunk = 0
+      var chromosome : String = asScalaIterator(new XReadLines(intervals)).next().split(":")(0)
+      var chunkFile : File = new File(tmpdir,"ChunkVCF.chunk%d.intervals.list".format(chunkNum))
+      var chunkWriter = new PrintStream(chunkFile)
+      asScalaIterator(new XReadLines(intervals)).foreach( int => {
+        // check new chromosome or full chunk
+        if ( ( preserve && ! int.split(":")(0).equals(chromosome) ) || numLinesInChunk > numEntries ) {
+          chunkWriter.close()
+          val chunkSelect : SelectVariants = new SelectVariants
+          chunkSelect.reference_sequence = ref
+          chunkSelect.memoryLimit = 2
+          chunkSelect.intervals :+= chunkFile
+          if ( extractSamples != null )
+            chunkSelect.sample_file = extractSamples
+          chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum))
+          add(chunkSelect)
+          chunkNum += 1
+          numLinesInChunk = 0
+          chromosome = int.split(":")(0)
+          chunkFile = new File(tmpdir,"ChunkVCF.chunk%d.intervals.list".format(chunkNum))
+          chunkWriter = new PrintStream(chunkFile)
+        }
+        chunkWriter.printf("%s%n",int)
+        numLinesInChunk += 1
+      })
+      // last chunk
+      if ( numLinesInChunk > 0 ) {
+        // some work to do
+        val chunkSelect : SelectVariants = new SelectVariants
+        chunkSelect.reference_sequence = ref
+        chunkSelect.memoryLimit = 2
+        chunkSelect.intervals :+= chunkFile
+        chunkWriter.close()
+        if ( extractSamples != null )
+          chunkSelect.sample_file = extractSamples
+        chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum))
+        add(chunkSelect)
+      }
+    }
+  }
+}
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala
@ -6,7 +6,7 @@ import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals
 import org.broadinstitute.sting.utils.text.XReadLines
 import collection.JavaConversions._
 import java.io._
-import org.broadinstitute.sting.queue.extensions.gatk.VariantsToPed
+import org.broadinstitute.sting.queue.extensions.gatk.{SelectVariants, VariantsToPed}

 /**
 * Created by IntelliJ IDEA.
@ -31,11 +31,14 @@ class VcfToPed extends QScript {
  var intervals : File = _

  @Argument(shortName="R",fullName="Ref",required=false,doc="Reference file")
-  var ref : File = new File("/humgen/1kg/references/human_g1k_v37.fasta")
+  var ref : File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")

  @Argument(shortName="D",fullName="dbsnp",required=false,doc="dbsnp file")
  var dbsnp : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf")

+  @Argument(shortName="sf",fullName="sampleFile",required=false,doc="sample file")
+  var samFile : File = _
+
  val tmpdir : File = System.getProperty("java.io.tmpdir")

  def script = {
@ -59,14 +62,27 @@ class VcfToPed extends QScript {
          val toPed : VariantsToPed = new VariantsToPed
          toPed.memoryLimit = 2
          toPed.reference_sequence = ref
-          toPed.intervals :+= new File(subListFile)
+          toPed.intervals :+= subListFile
          toPed.dbsnp = dbsnp
-          toPed.variant = variants
+          if ( samFile != null ) {
+            val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk)
+            val extract : SelectVariants = new SelectVariants
+            extract.reference_sequence = ref
+            extract.memoryLimit = 2
+            extract.intervals :+= subListFile
+            extract.variant = variants
+            extract.out = new File(tmpdir,base+"_extract%d.vcf".format(chunk))
+            extract.sample_file :+= samFile
+            add(extract)
+            toPed.variant = extract.out
+          } else {
+            toPed.variant = variants
+          }
          toPed.metaData = meta
-          lazy val base : String = bed.getName.stripSuffix(".bed")+"_%".format(chunk)
-          lazy val tBed = new File(tmpdir,base+".bed")
-          lazy val bim = new File(tmpdir,base+".bim")
-          lazy val fam = new File(tmpdir,base+".fam")
+          val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk)
+          val tBed = new File(tmpdir,base+".bed")
+          val bim = new File(tmpdir,base+".bim")
+          val fam = new File(tmpdir,base+".fam")
          toPed.bed = tBed
          toPed.bim = bim
          toPed.fam = fam
@ -87,12 +103,26 @@ class VcfToPed extends QScript {
        toPed.reference_sequence = ref
        toPed.intervals :+= new File(subListFile)
        toPed.dbsnp = dbsnp
-        toPed.variant = variants
+        if ( samFile != null ) {
+          val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk)
+          val extract : SelectVariants = new SelectVariants
+          extract.reference_sequence = ref
+          extract.memoryLimit = 2
+          extract.intervals :+= subListFile
+          extract.variant = variants
+          extract.out = new File(tmpdir,base+"_extract%d.vcf".format(chunk))
+          extract.sample_file :+= samFile
+          add(extract)
+          toPed.variant = extract.out
+        } else {
+          toPed.variant = variants
+        }
        toPed.metaData = meta
-        lazy val base : String = bed.getName.stripSuffix(".bed")+"_%".format(chunk)
-        lazy val tBed = new File(tmpdir,base+".bed")
-        lazy val bim = new File(tmpdir,base+".bim")
-        lazy val fam = new File(tmpdir,base+".fam")
+        toPed.memoryLimit = 2
+        val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk)
+        val tBed = new File(tmpdir,base+".bed")
+        val bim = new File(tmpdir,base+".bim")
+        val fam = new File(tmpdir,base+".fam")
        toPed.bed = tBed
        toPed.bim = bim
        toPed.fam = fam