diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java index 43d5f0b28..34f4bd607 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -17,16 +18,13 @@ import java.util.*; /** * Created by IntelliJ IDEA. - * User: rpoplin, lfran + * User: rpoplin, lfran, ebanks * Date: 11/14/11 */ -public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation { +public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { private Set trios = null; - private final static int REF = 0; - private final static int HET = 1; - private final static int HOM = 2; private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { @@ -38,10 +36,10 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen } } - final Map toRet = new HashMap(1); + final Map toRet = new HashMap(1); final HashSet triosToTest = new HashSet(); - for( final Sample child : trios) { + for( final Sample child : trios ) { final boolean hasAppropriateGenotypes = vc.hasGenotype(child.getID()) && vc.getGenotype(child.getID()).hasLikelihoods() && vc.hasGenotype(child.getPaternalID()) && vc.getGenotype(child.getPaternalID()).hasLikelihoods() && vc.hasGenotype(child.getMaternalID()) && vc.getGenotype(child.getMaternalID()).hasLikelihoods(); @@ -65,28 +63,54 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen // Following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT private double calculateTDT( final VariantContext vc, final Set triosToTest ) { - final double nABGivenABandBB = calculateNChildren(vc, triosToTest, HET, HET, HOM) + calculateNChildren(vc, triosToTest, HET, HOM, HET); - final double nBBGivenABandBB = calculateNChildren(vc, triosToTest, HOM, HET, HOM) + calculateNChildren(vc, triosToTest, HOM, HOM, HET); - final double nAAGivenABandAB = calculateNChildren(vc, triosToTest, REF, HET, HET); - final double nBBGivenABandAB = calculateNChildren(vc, triosToTest, HOM, HET, HET); - final double nAAGivenAAandAB = calculateNChildren(vc, triosToTest, REF, REF, HET) + calculateNChildren(vc, triosToTest, REF, HET, REF); - final double nABGivenAAandAB = calculateNChildren(vc, triosToTest, HET, REF, HET) + calculateNChildren(vc, triosToTest, HET, HET, REF); + double nABGivenABandBB = 0.0; + double nBBGivenABandBB = 0.0; + double nAAGivenABandAB = 0.0; + double nBBGivenABandAB = 0.0; + double nAAGivenAAandAB = 0.0; + double nABGivenAAandAB = 0.0; + + // for each pair of alleles, add the likelihoods + int numAlleles = vc.getNAlleles(); + for ( int allele1 = 0; allele1 < numAlleles; allele1++ ) { + for ( int allele2 = allele1 + 1; allele2 < numAlleles; allele2++ ) { + + // TODO -- cache these for better performance + final int HOM1index = determineHomIndex(allele1, numAlleles); + final int HETindex = HOM1index + (allele2 - allele1); + final int HOM2index = determineHomIndex(allele2, numAlleles); + + nABGivenABandBB += calculateNChildren(vc, triosToTest, HETindex, HETindex, HOM2index) + calculateNChildren(vc, triosToTest, HETindex, HOM2index, HETindex); + nBBGivenABandBB += calculateNChildren(vc, triosToTest, HOM2index, HETindex, HOM2index) + calculateNChildren(vc, triosToTest, HOM2index, HOM2index, HETindex); + nAAGivenABandAB += calculateNChildren(vc, triosToTest, HOM1index, HETindex, HETindex); + nBBGivenABandAB += calculateNChildren(vc, triosToTest, HOM2index, HETindex, HETindex); + nAAGivenAAandAB += calculateNChildren(vc, triosToTest, HOM1index, HOM1index, HETindex) + calculateNChildren(vc, triosToTest, HOM1index, HETindex, HOM1index); + nABGivenAAandAB += calculateNChildren(vc, triosToTest, HETindex, HOM1index, HETindex) + calculateNChildren(vc, triosToTest, HETindex, HETindex, HOM1index); + } + } final double numer = (nABGivenABandBB - nBBGivenABandBB) + 2.0 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB); final double denom = (nABGivenABandBB + nBBGivenABandBB) + 4.0 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB); return (numer * numer) / denom; } - private double calculateNChildren( final VariantContext vc, final Set triosToTest, final int childIdx, final int parent1Idx, final int parent2Idx ) { + private double calculateNChildren( final VariantContext vc, final Set triosToTest, final int childIdx, final int momIdx, final int dadIdx ) { final double likelihoodVector[] = new double[triosToTest.size()]; int iii = 0; for( final Sample child : triosToTest ) { final double[] momGL = vc.getGenotype(child.getMaternalID()).getLikelihoods().getAsVector(); final double[] dadGL = vc.getGenotype(child.getPaternalID()).getLikelihoods().getAsVector(); final double[] childGL = vc.getGenotype(child.getID()).getLikelihoods().getAsVector(); - likelihoodVector[iii++] = momGL[parent1Idx] + dadGL[parent2Idx] + childGL[childIdx]; + likelihoodVector[iii++] = momGL[momIdx] + dadGL[dadIdx] + childGL[childIdx]; } return MathUtils.sumLog10(likelihoodVector); } + + private static int determineHomIndex(final int alleleIndex, int numAlleles) { + int result = 0; + for ( int i = 0; i < alleleIndex; i++ ) + result += numAlleles--; + return result; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java index 32b2dd06c..aab230b69 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java @@ -114,7 +114,7 @@ public class VariantsToPed extends RodWalker { String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID); String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3"; String pheno = mVals.get("phenotype"); - outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,pid,sample,mid,sex,pheno); + outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); } } } diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala new file mode 100644 index 000000000..257fef021 --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala @@ -0,0 +1,88 @@ +package org.broadinstitute.sting.queue.qscripts.lib + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals +import scala.collection.JavaConversions._ +import org.broadinstitute.sting.utils.text.XReadLines +import java.io.PrintStream +import org.broadinstitute.sting.queue.extensions.gatk.SelectVariants + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 2/2/12 + * Time: 12:13 PM + * To change this template use File | Settings | File Templates. + */ + +class ChunkVCF extends QScript { + + @Input(shortName="V",fullName="VCF",doc="The VCF you want to chunk",required=true) + var inVCF : File = _ + + @Input(shortName="N",fullName="numEntriesInChunk",doc="The number of variants per chunk",required=true) + var numEntries : Int = _ + + @Input(shortName="I",fullName="Intervals",doc="The SNP interval list to chunk. If not provided, one will be created for you to provide in a second run.") + var intervals : File = _ + + @Input(fullName="preserveChromosomes",doc="Restrict chunks to one chromosome (smaller chunk at end of chromosome)",required=false) + var preserve : Boolean = false + + @Input(fullName="reference",doc="The reference file",required=false) + var ref : File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + + @Input(fullName="samples",doc="A file of sample IDs to condense VCF file to",required=false) + var extractSamples : File = _ + + val tmpdir : File = System.getProperty("java.io.tmpdir") + + def script = { + if ( intervals == null ) { + // create an interval list from the VCF + val ivals : File = swapExt(variants,".vcf",".intervals.list") + val extract : VCFExtractIntervals = new VCFExtractIntervals(variants,ivals,false) + add(extract) + } else { + var chunkNum = 1 + var numLinesInChunk = 0 + var chromosome : String = asScalaIterator(new XReadLines(intervals)).next().split(":")(0) + var chunkFile : File = new File(tmpdir,"ChunkVCF.chunk%d.intervals.list".format(chunkNum)) + var chunkWriter = new PrintStream(chunkFile) + asScalaIterator(new XReadLines(intervals)).foreach( int => { + // check new chromosome or full chunk + if ( ( preserve && ! int.split(":")(0).equals(chromosome) ) || numLinesInChunk > numEntries ) { + chunkWriter.close() + val chunkSelect : SelectVariants = new SelectVariants + chunkSelect.reference_sequence = ref + chunkSelect.memoryLimit = 2 + chunkSelect.intervals :+= chunkFile + if ( extractSamples != null ) + chunkSelect.sample_file = extractSamples + chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum)) + add(chunkSelect) + chunkNum += 1 + numLinesInChunk = 0 + chromosome = int.split(":")(0) + chunkFile = new File(tmpdir,"ChunkVCF.chunk%d.intervals.list".format(chunkNum)) + chunkWriter = new PrintStream(chunkFile) + } + chunkWriter.printf("%s%n",int) + numLinesInChunk += 1 + }) + // last chunk + if ( numLinesInChunk > 0 ) { + // some work to do + val chunkSelect : SelectVariants = new SelectVariants + chunkSelect.reference_sequence = ref + chunkSelect.memoryLimit = 2 + chunkSelect.intervals :+= chunkFile + chunkWriter.close() + if ( extractSamples != null ) + chunkSelect.sample_file = extractSamples + chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum)) + add(chunkSelect) + } + } + } +} \ No newline at end of file diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala index 04f73d562..913a62e26 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala @@ -6,7 +6,7 @@ import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals import org.broadinstitute.sting.utils.text.XReadLines import collection.JavaConversions._ import java.io._ -import org.broadinstitute.sting.queue.extensions.gatk.VariantsToPed +import org.broadinstitute.sting.queue.extensions.gatk.{SelectVariants, VariantsToPed} /** * Created by IntelliJ IDEA. @@ -31,11 +31,14 @@ class VcfToPed extends QScript { var intervals : File = _ @Argument(shortName="R",fullName="Ref",required=false,doc="Reference file") - var ref : File = new File("/humgen/1kg/references/human_g1k_v37.fasta") + var ref : File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") @Argument(shortName="D",fullName="dbsnp",required=false,doc="dbsnp file") var dbsnp : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf") + @Argument(shortName="sf",fullName="sampleFile",required=false,doc="sample file") + var samFile : File = _ + val tmpdir : File = System.getProperty("java.io.tmpdir") def script = { @@ -59,14 +62,27 @@ class VcfToPed extends QScript { val toPed : VariantsToPed = new VariantsToPed toPed.memoryLimit = 2 toPed.reference_sequence = ref - toPed.intervals :+= new File(subListFile) + toPed.intervals :+= subListFile toPed.dbsnp = dbsnp - toPed.variant = variants + if ( samFile != null ) { + val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk) + val extract : SelectVariants = new SelectVariants + extract.reference_sequence = ref + extract.memoryLimit = 2 + extract.intervals :+= subListFile + extract.variant = variants + extract.out = new File(tmpdir,base+"_extract%d.vcf".format(chunk)) + extract.sample_file :+= samFile + add(extract) + toPed.variant = extract.out + } else { + toPed.variant = variants + } toPed.metaData = meta - lazy val base : String = bed.getName.stripSuffix(".bed")+"_%".format(chunk) - lazy val tBed = new File(tmpdir,base+".bed") - lazy val bim = new File(tmpdir,base+".bim") - lazy val fam = new File(tmpdir,base+".fam") + val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk) + val tBed = new File(tmpdir,base+".bed") + val bim = new File(tmpdir,base+".bim") + val fam = new File(tmpdir,base+".fam") toPed.bed = tBed toPed.bim = bim toPed.fam = fam @@ -87,12 +103,26 @@ class VcfToPed extends QScript { toPed.reference_sequence = ref toPed.intervals :+= new File(subListFile) toPed.dbsnp = dbsnp - toPed.variant = variants + if ( samFile != null ) { + val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk) + val extract : SelectVariants = new SelectVariants + extract.reference_sequence = ref + extract.memoryLimit = 2 + extract.intervals :+= subListFile + extract.variant = variants + extract.out = new File(tmpdir,base+"_extract%d.vcf".format(chunk)) + extract.sample_file :+= samFile + add(extract) + toPed.variant = extract.out + } else { + toPed.variant = variants + } toPed.metaData = meta - lazy val base : String = bed.getName.stripSuffix(".bed")+"_%".format(chunk) - lazy val tBed = new File(tmpdir,base+".bed") - lazy val bim = new File(tmpdir,base+".bim") - lazy val fam = new File(tmpdir,base+".fam") + toPed.memoryLimit = 2 + val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk) + val tBed = new File(tmpdir,base+".bed") + val bim = new File(tmpdir,base+".bim") + val fam = new File(tmpdir,base+".fam") toPed.bed = tBed toPed.bim = bim toPed.fam = fam