From 87a63d54d62fd639237f7755e7d6bc6498709469 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Wed, 1 Feb 2012 12:05:29 -0500 Subject: [PATCH 01/10] fix the script! --- .../sting/queue/qscripts/lib/VcfToPed.scala | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala index 04f73d562..1c2620441 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala @@ -63,10 +63,10 @@ class VcfToPed extends QScript { toPed.dbsnp = dbsnp toPed.variant = variants toPed.metaData = meta - lazy val base : String = bed.getName.stripSuffix(".bed")+"_%".format(chunk) - lazy val tBed = new File(tmpdir,base+".bed") - lazy val bim = new File(tmpdir,base+".bim") - lazy val fam = new File(tmpdir,base+".fam") + val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk) + val tBed = new File(tmpdir,base+".bed") + val bim = new File(tmpdir,base+".bim") + val fam = new File(tmpdir,base+".fam") toPed.bed = tBed toPed.bim = bim toPed.fam = fam @@ -89,10 +89,10 @@ class VcfToPed extends QScript { toPed.dbsnp = dbsnp toPed.variant = variants toPed.metaData = meta - lazy val base : String = bed.getName.stripSuffix(".bed")+"_%".format(chunk) - lazy val tBed = new File(tmpdir,base+".bed") - lazy val bim = new File(tmpdir,base+".bim") - lazy val fam = new File(tmpdir,base+".fam") + val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk) + val tBed = new File(tmpdir,base+".bed") + val bim = new File(tmpdir,base+".bim") + val fam = new File(tmpdir,base+".fam") toPed.bed = tBed toPed.bim = bim toPed.fam = fam From b567ed8793ad4af875e4ef4aab41c5ef89352fb5 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Wed, 1 Feb 2012 12:35:18 -0500 Subject: [PATCH 02/10] Use the right reference path :( --- .../org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala index 1c2620441..30b5cb084 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala @@ -31,7 +31,7 @@ class VcfToPed extends QScript { var intervals : File = _ @Argument(shortName="R",fullName="Ref",required=false,doc="Reference file") - var ref : File = new File("/humgen/1kg/references/human_g1k_v37.fasta") + var ref : File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") @Argument(shortName="D",fullName="dbsnp",required=false,doc="dbsnp file") var dbsnp : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf") From f8c5406084281eacf927585a974c8f73b51d6158 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 2 Feb 2012 09:06:39 -0500 Subject: [PATCH 03/10] Add the ability to extract samples --- .../sting/queue/qscripts/lib/VcfToPed.scala | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala index 30b5cb084..4995888bb 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala @@ -6,7 +6,7 @@ import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals import org.broadinstitute.sting.utils.text.XReadLines import collection.JavaConversions._ import java.io._ -import org.broadinstitute.sting.queue.extensions.gatk.VariantsToPed +import org.broadinstitute.sting.queue.extensions.gatk.{SelectVariants, VariantsToPed} /** * Created by IntelliJ IDEA. @@ -36,6 +36,9 @@ class VcfToPed extends QScript { @Argument(shortName="D",fullName="dbsnp",required=false,doc="dbsnp file") var dbsnp : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf") + @Argument(shortName="sf",fullName="sampleFile",required=false,doc="sample file") + var samFile : File = _ + val tmpdir : File = System.getProperty("java.io.tmpdir") def script = { @@ -59,9 +62,22 @@ class VcfToPed extends QScript { val toPed : VariantsToPed = new VariantsToPed toPed.memoryLimit = 2 toPed.reference_sequence = ref - toPed.intervals :+= new File(subListFile) + toPed.intervals :+= subListFile toPed.dbsnp = dbsnp - toPed.variant = variants + if ( samFile != null ) { + val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk) + val extract : SelectVariants = new SelectVariants + extract.reference_sequence = ref + extract.memoryLimit = 2 + extract.intervals :+= subListFile + extract.variant = variants + extract.out = new File(tmpdir,base+"_extract%d.vcf".format(chunk)) + extract.sample_file :+= samFile + add(extract) + toPed.variant = extract.out + } else { + toPed.variant = variants + } toPed.metaData = meta val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk) val tBed = new File(tmpdir,base+".bed") From 45bf2562cc2cde70c15868c0e2b14f5c50c5c911 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 2 Feb 2012 09:11:17 -0500 Subject: [PATCH 04/10] . --- .../sting/queue/qscripts/lib/VcfToPed.scala | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala index 4995888bb..2f691b907 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala @@ -103,7 +103,20 @@ class VcfToPed extends QScript { toPed.reference_sequence = ref toPed.intervals :+= new File(subListFile) toPed.dbsnp = dbsnp - toPed.variant = variants + if ( samFile != null ) { + val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk) + val extract : SelectVariants = new SelectVariants + extract.reference_sequence = ref + extract.memoryLimit = 2 + extract.intervals :+= subListFile + extract.variant = variants + extract.out = new File(tmpdir,base+"_extract%d.vcf".format(chunk)) + extract.sample_file :+= samFile + add(extract) + toPed.variant = extract.out + } else { + toPed.variant = variants + } toPed.metaData = meta val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk) val tBed = new File(tmpdir,base+".bed") From 0c562756eb5b96a36485abe40e3907940643b8b7 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 2 Feb 2012 10:30:09 -0500 Subject: [PATCH 05/10] Add a memory limit so this thing doesn't get killed on the farm --- .../org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala index 2f691b907..913a62e26 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala @@ -118,6 +118,7 @@ class VcfToPed extends QScript { toPed.variant = variants } toPed.metaData = meta + toPed.memoryLimit = 2 val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk) val tBed = new File(tmpdir,base+".bed") val bim = new File(tmpdir,base+".bim") From 0111505ea9e2e3df4c8e5ce3553df45cd0e64eca Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 2 Feb 2012 11:41:16 -0500 Subject: [PATCH 06/10] Terrible. Swapping the paternal and sample ids. --- .../sting/gatk/walkers/variantutils/VariantsToPed.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java index 32b2dd06c..aab230b69 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java @@ -114,7 +114,7 @@ public class VariantsToPed extends RodWalker { String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID); String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3"; String pheno = mVals.get("phenotype"); - outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,pid,sample,mid,sex,pheno); + outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); } } } From 27ea6426a43b9a1ad00105953e82d33e7196c217 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 2 Feb 2012 12:29:03 -0500 Subject: [PATCH 07/10] Small script to chunk up a VCF into equal-sized chunks --- .../sting/queue/qscripts/lib/ChunkVCF.scala | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala new file mode 100644 index 000000000..257fef021 --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala @@ -0,0 +1,88 @@ +package org.broadinstitute.sting.queue.qscripts.lib + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals +import scala.collection.JavaConversions._ +import org.broadinstitute.sting.utils.text.XReadLines +import java.io.PrintStream +import org.broadinstitute.sting.queue.extensions.gatk.SelectVariants + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 2/2/12 + * Time: 12:13 PM + * To change this template use File | Settings | File Templates. + */ + +class ChunkVCF extends QScript { + + @Input(shortName="V",fullName="VCF",doc="The VCF you want to chunk",required=true) + var inVCF : File = _ + + @Input(shortName="N",fullName="numEntriesInChunk",doc="The number of variants per chunk",required=true) + var numEntries : Int = _ + + @Input(shortName="I",fullName="Intervals",doc="The SNP interval list to chunk. If not provided, one will be created for you to provide in a second run.") + var intervals : File = _ + + @Input(fullName="preserveChromosomes",doc="Restrict chunks to one chromosome (smaller chunk at end of chromosome)",required=false) + var preserve : Boolean = false + + @Input(fullName="reference",doc="The reference file",required=false) + var ref : File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + + @Input(fullName="samples",doc="A file of sample IDs to condense VCF file to",required=false) + var extractSamples : File = _ + + val tmpdir : File = System.getProperty("java.io.tmpdir") + + def script = { + if ( intervals == null ) { + // create an interval list from the VCF + val ivals : File = swapExt(variants,".vcf",".intervals.list") + val extract : VCFExtractIntervals = new VCFExtractIntervals(variants,ivals,false) + add(extract) + } else { + var chunkNum = 1 + var numLinesInChunk = 0 + var chromosome : String = asScalaIterator(new XReadLines(intervals)).next().split(":")(0) + var chunkFile : File = new File(tmpdir,"ChunkVCF.chunk%d.intervals.list".format(chunkNum)) + var chunkWriter = new PrintStream(chunkFile) + asScalaIterator(new XReadLines(intervals)).foreach( int => { + // check new chromosome or full chunk + if ( ( preserve && ! int.split(":")(0).equals(chromosome) ) || numLinesInChunk > numEntries ) { + chunkWriter.close() + val chunkSelect : SelectVariants = new SelectVariants + chunkSelect.reference_sequence = ref + chunkSelect.memoryLimit = 2 + chunkSelect.intervals :+= chunkFile + if ( extractSamples != null ) + chunkSelect.sample_file = extractSamples + chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum)) + add(chunkSelect) + chunkNum += 1 + numLinesInChunk = 0 + chromosome = int.split(":")(0) + chunkFile = new File(tmpdir,"ChunkVCF.chunk%d.intervals.list".format(chunkNum)) + chunkWriter = new PrintStream(chunkFile) + } + chunkWriter.printf("%s%n",int) + numLinesInChunk += 1 + }) + // last chunk + if ( numLinesInChunk > 0 ) { + // some work to do + val chunkSelect : SelectVariants = new SelectVariants + chunkSelect.reference_sequence = ref + chunkSelect.memoryLimit = 2 + chunkSelect.intervals :+= chunkFile + chunkWriter.close() + if ( extractSamples != null ) + chunkSelect.sample_file = extractSamples + chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum)) + add(chunkSelect) + } + } + } +} \ No newline at end of file From 3abfbcbcf2eb81394adf79072232967f8f6b069d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 3 Feb 2012 12:23:21 -0500 Subject: [PATCH 08/10] Generalized the TDT for multi-allelic events --- .../TransmissionDisequilibriumTest.java | 54 +++++++++++++------ 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java index 43d5f0b28..34f4bd607 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -17,16 +18,13 @@ import java.util.*; /** * Created by IntelliJ IDEA. - * User: rpoplin, lfran + * User: rpoplin, lfran, ebanks * Date: 11/14/11 */ -public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation { +public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { private Set trios = null; - private final static int REF = 0; - private final static int HET = 1; - private final static int HOM = 2; private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { @@ -38,10 +36,10 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen } } - final Map toRet = new HashMap(1); + final Map toRet = new HashMap(1); final HashSet triosToTest = new HashSet(); - for( final Sample child : trios) { + for( final Sample child : trios ) { final boolean hasAppropriateGenotypes = vc.hasGenotype(child.getID()) && vc.getGenotype(child.getID()).hasLikelihoods() && vc.hasGenotype(child.getPaternalID()) && vc.getGenotype(child.getPaternalID()).hasLikelihoods() && vc.hasGenotype(child.getMaternalID()) && vc.getGenotype(child.getMaternalID()).hasLikelihoods(); @@ -65,28 +63,54 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen // Following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT private double calculateTDT( final VariantContext vc, final Set triosToTest ) { - final double nABGivenABandBB = calculateNChildren(vc, triosToTest, HET, HET, HOM) + calculateNChildren(vc, triosToTest, HET, HOM, HET); - final double nBBGivenABandBB = calculateNChildren(vc, triosToTest, HOM, HET, HOM) + calculateNChildren(vc, triosToTest, HOM, HOM, HET); - final double nAAGivenABandAB = calculateNChildren(vc, triosToTest, REF, HET, HET); - final double nBBGivenABandAB = calculateNChildren(vc, triosToTest, HOM, HET, HET); - final double nAAGivenAAandAB = calculateNChildren(vc, triosToTest, REF, REF, HET) + calculateNChildren(vc, triosToTest, REF, HET, REF); - final double nABGivenAAandAB = calculateNChildren(vc, triosToTest, HET, REF, HET) + calculateNChildren(vc, triosToTest, HET, HET, REF); + double nABGivenABandBB = 0.0; + double nBBGivenABandBB = 0.0; + double nAAGivenABandAB = 0.0; + double nBBGivenABandAB = 0.0; + double nAAGivenAAandAB = 0.0; + double nABGivenAAandAB = 0.0; + + // for each pair of alleles, add the likelihoods + int numAlleles = vc.getNAlleles(); + for ( int allele1 = 0; allele1 < numAlleles; allele1++ ) { + for ( int allele2 = allele1 + 1; allele2 < numAlleles; allele2++ ) { + + // TODO -- cache these for better performance + final int HOM1index = determineHomIndex(allele1, numAlleles); + final int HETindex = HOM1index + (allele2 - allele1); + final int HOM2index = determineHomIndex(allele2, numAlleles); + + nABGivenABandBB += calculateNChildren(vc, triosToTest, HETindex, HETindex, HOM2index) + calculateNChildren(vc, triosToTest, HETindex, HOM2index, HETindex); + nBBGivenABandBB += calculateNChildren(vc, triosToTest, HOM2index, HETindex, HOM2index) + calculateNChildren(vc, triosToTest, HOM2index, HOM2index, HETindex); + nAAGivenABandAB += calculateNChildren(vc, triosToTest, HOM1index, HETindex, HETindex); + nBBGivenABandAB += calculateNChildren(vc, triosToTest, HOM2index, HETindex, HETindex); + nAAGivenAAandAB += calculateNChildren(vc, triosToTest, HOM1index, HOM1index, HETindex) + calculateNChildren(vc, triosToTest, HOM1index, HETindex, HOM1index); + nABGivenAAandAB += calculateNChildren(vc, triosToTest, HETindex, HOM1index, HETindex) + calculateNChildren(vc, triosToTest, HETindex, HETindex, HOM1index); + } + } final double numer = (nABGivenABandBB - nBBGivenABandBB) + 2.0 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB); final double denom = (nABGivenABandBB + nBBGivenABandBB) + 4.0 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB); return (numer * numer) / denom; } - private double calculateNChildren( final VariantContext vc, final Set triosToTest, final int childIdx, final int parent1Idx, final int parent2Idx ) { + private double calculateNChildren( final VariantContext vc, final Set triosToTest, final int childIdx, final int momIdx, final int dadIdx ) { final double likelihoodVector[] = new double[triosToTest.size()]; int iii = 0; for( final Sample child : triosToTest ) { final double[] momGL = vc.getGenotype(child.getMaternalID()).getLikelihoods().getAsVector(); final double[] dadGL = vc.getGenotype(child.getPaternalID()).getLikelihoods().getAsVector(); final double[] childGL = vc.getGenotype(child.getID()).getLikelihoods().getAsVector(); - likelihoodVector[iii++] = momGL[parent1Idx] + dadGL[parent2Idx] + childGL[childIdx]; + likelihoodVector[iii++] = momGL[momIdx] + dadGL[dadIdx] + childGL[childIdx]; } return MathUtils.sumLog10(likelihoodVector); } + + private static int determineHomIndex(final int alleleIndex, int numAlleles) { + int result = 0; + for ( int i = 0; i < alleleIndex; i++ ) + result += numAlleles--; + return result; + } }