Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
0e44430e47
|
|
@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.samples.Sample;
|
||||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
|
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
|
||||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation;
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||||
|
|
@ -17,16 +18,13 @@ import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by IntelliJ IDEA.
|
* Created by IntelliJ IDEA.
|
||||||
* User: rpoplin, lfran
|
* User: rpoplin, lfran, ebanks
|
||||||
* Date: 11/14/11
|
* Date: 11/14/11
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation {
|
||||||
|
|
||||||
private Set<Sample> trios = null;
|
private Set<Sample> trios = null;
|
||||||
private final static int REF = 0;
|
|
||||||
private final static int HET = 1;
|
|
||||||
private final static int HOM = 2;
|
|
||||||
private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information
|
private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information
|
||||||
|
|
||||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||||
|
|
@ -38,10 +36,10 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final Map<String,Object> toRet = new HashMap<String,Object>(1);
|
final Map<String, Object> toRet = new HashMap<String, Object>(1);
|
||||||
final HashSet<Sample> triosToTest = new HashSet<Sample>();
|
final HashSet<Sample> triosToTest = new HashSet<Sample>();
|
||||||
|
|
||||||
for( final Sample child : trios) {
|
for( final Sample child : trios ) {
|
||||||
final boolean hasAppropriateGenotypes = vc.hasGenotype(child.getID()) && vc.getGenotype(child.getID()).hasLikelihoods() &&
|
final boolean hasAppropriateGenotypes = vc.hasGenotype(child.getID()) && vc.getGenotype(child.getID()).hasLikelihoods() &&
|
||||||
vc.hasGenotype(child.getPaternalID()) && vc.getGenotype(child.getPaternalID()).hasLikelihoods() &&
|
vc.hasGenotype(child.getPaternalID()) && vc.getGenotype(child.getPaternalID()).hasLikelihoods() &&
|
||||||
vc.hasGenotype(child.getMaternalID()) && vc.getGenotype(child.getMaternalID()).hasLikelihoods();
|
vc.hasGenotype(child.getMaternalID()) && vc.getGenotype(child.getMaternalID()).hasLikelihoods();
|
||||||
|
|
@ -65,28 +63,54 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen
|
||||||
// Following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT
|
// Following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT
|
||||||
private double calculateTDT( final VariantContext vc, final Set<Sample> triosToTest ) {
|
private double calculateTDT( final VariantContext vc, final Set<Sample> triosToTest ) {
|
||||||
|
|
||||||
final double nABGivenABandBB = calculateNChildren(vc, triosToTest, HET, HET, HOM) + calculateNChildren(vc, triosToTest, HET, HOM, HET);
|
double nABGivenABandBB = 0.0;
|
||||||
final double nBBGivenABandBB = calculateNChildren(vc, triosToTest, HOM, HET, HOM) + calculateNChildren(vc, triosToTest, HOM, HOM, HET);
|
double nBBGivenABandBB = 0.0;
|
||||||
final double nAAGivenABandAB = calculateNChildren(vc, triosToTest, REF, HET, HET);
|
double nAAGivenABandAB = 0.0;
|
||||||
final double nBBGivenABandAB = calculateNChildren(vc, triosToTest, HOM, HET, HET);
|
double nBBGivenABandAB = 0.0;
|
||||||
final double nAAGivenAAandAB = calculateNChildren(vc, triosToTest, REF, REF, HET) + calculateNChildren(vc, triosToTest, REF, HET, REF);
|
double nAAGivenAAandAB = 0.0;
|
||||||
final double nABGivenAAandAB = calculateNChildren(vc, triosToTest, HET, REF, HET) + calculateNChildren(vc, triosToTest, HET, HET, REF);
|
double nABGivenAAandAB = 0.0;
|
||||||
|
|
||||||
|
// for each pair of alleles, add the likelihoods
|
||||||
|
int numAlleles = vc.getNAlleles();
|
||||||
|
for ( int allele1 = 0; allele1 < numAlleles; allele1++ ) {
|
||||||
|
for ( int allele2 = allele1 + 1; allele2 < numAlleles; allele2++ ) {
|
||||||
|
|
||||||
|
// TODO -- cache these for better performance
|
||||||
|
final int HOM1index = determineHomIndex(allele1, numAlleles);
|
||||||
|
final int HETindex = HOM1index + (allele2 - allele1);
|
||||||
|
final int HOM2index = determineHomIndex(allele2, numAlleles);
|
||||||
|
|
||||||
|
nABGivenABandBB += calculateNChildren(vc, triosToTest, HETindex, HETindex, HOM2index) + calculateNChildren(vc, triosToTest, HETindex, HOM2index, HETindex);
|
||||||
|
nBBGivenABandBB += calculateNChildren(vc, triosToTest, HOM2index, HETindex, HOM2index) + calculateNChildren(vc, triosToTest, HOM2index, HOM2index, HETindex);
|
||||||
|
nAAGivenABandAB += calculateNChildren(vc, triosToTest, HOM1index, HETindex, HETindex);
|
||||||
|
nBBGivenABandAB += calculateNChildren(vc, triosToTest, HOM2index, HETindex, HETindex);
|
||||||
|
nAAGivenAAandAB += calculateNChildren(vc, triosToTest, HOM1index, HOM1index, HETindex) + calculateNChildren(vc, triosToTest, HOM1index, HETindex, HOM1index);
|
||||||
|
nABGivenAAandAB += calculateNChildren(vc, triosToTest, HETindex, HOM1index, HETindex) + calculateNChildren(vc, triosToTest, HETindex, HETindex, HOM1index);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
final double numer = (nABGivenABandBB - nBBGivenABandBB) + 2.0 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB);
|
final double numer = (nABGivenABandBB - nBBGivenABandBB) + 2.0 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB);
|
||||||
final double denom = (nABGivenABandBB + nBBGivenABandBB) + 4.0 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB);
|
final double denom = (nABGivenABandBB + nBBGivenABandBB) + 4.0 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB);
|
||||||
return (numer * numer) / denom;
|
return (numer * numer) / denom;
|
||||||
}
|
}
|
||||||
|
|
||||||
private double calculateNChildren( final VariantContext vc, final Set<Sample> triosToTest, final int childIdx, final int parent1Idx, final int parent2Idx ) {
|
private double calculateNChildren( final VariantContext vc, final Set<Sample> triosToTest, final int childIdx, final int momIdx, final int dadIdx ) {
|
||||||
final double likelihoodVector[] = new double[triosToTest.size()];
|
final double likelihoodVector[] = new double[triosToTest.size()];
|
||||||
int iii = 0;
|
int iii = 0;
|
||||||
for( final Sample child : triosToTest ) {
|
for( final Sample child : triosToTest ) {
|
||||||
final double[] momGL = vc.getGenotype(child.getMaternalID()).getLikelihoods().getAsVector();
|
final double[] momGL = vc.getGenotype(child.getMaternalID()).getLikelihoods().getAsVector();
|
||||||
final double[] dadGL = vc.getGenotype(child.getPaternalID()).getLikelihoods().getAsVector();
|
final double[] dadGL = vc.getGenotype(child.getPaternalID()).getLikelihoods().getAsVector();
|
||||||
final double[] childGL = vc.getGenotype(child.getID()).getLikelihoods().getAsVector();
|
final double[] childGL = vc.getGenotype(child.getID()).getLikelihoods().getAsVector();
|
||||||
likelihoodVector[iii++] = momGL[parent1Idx] + dadGL[parent2Idx] + childGL[childIdx];
|
likelihoodVector[iii++] = momGL[momIdx] + dadGL[dadIdx] + childGL[childIdx];
|
||||||
}
|
}
|
||||||
|
|
||||||
return MathUtils.sumLog10(likelihoodVector);
|
return MathUtils.sumLog10(likelihoodVector);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static int determineHomIndex(final int alleleIndex, int numAlleles) {
|
||||||
|
int result = 0;
|
||||||
|
for ( int i = 0; i < alleleIndex; i++ )
|
||||||
|
result += numAlleles--;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -114,7 +114,7 @@ public class VariantsToPed extends RodWalker<Integer,Integer> {
|
||||||
String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID);
|
String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID);
|
||||||
String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3";
|
String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3";
|
||||||
String pheno = mVals.get("phenotype");
|
String pheno = mVals.get("phenotype");
|
||||||
outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,pid,sample,mid,sex,pheno);
|
outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,88 @@
|
||||||
|
package org.broadinstitute.sting.queue.qscripts.lib
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.queue.QScript
|
||||||
|
import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals
|
||||||
|
import scala.collection.JavaConversions._
|
||||||
|
import org.broadinstitute.sting.utils.text.XReadLines
|
||||||
|
import java.io.PrintStream
|
||||||
|
import org.broadinstitute.sting.queue.extensions.gatk.SelectVariants
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 2/2/12
|
||||||
|
* Time: 12:13 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
|
||||||
|
class ChunkVCF extends QScript {
|
||||||
|
|
||||||
|
@Input(shortName="V",fullName="VCF",doc="The VCF you want to chunk",required=true)
|
||||||
|
var inVCF : File = _
|
||||||
|
|
||||||
|
@Input(shortName="N",fullName="numEntriesInChunk",doc="The number of variants per chunk",required=true)
|
||||||
|
var numEntries : Int = _
|
||||||
|
|
||||||
|
@Input(shortName="I",fullName="Intervals",doc="The SNP interval list to chunk. If not provided, one will be created for you to provide in a second run.")
|
||||||
|
var intervals : File = _
|
||||||
|
|
||||||
|
@Input(fullName="preserveChromosomes",doc="Restrict chunks to one chromosome (smaller chunk at end of chromosome)",required=false)
|
||||||
|
var preserve : Boolean = false
|
||||||
|
|
||||||
|
@Input(fullName="reference",doc="The reference file",required=false)
|
||||||
|
var ref : File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
|
||||||
|
|
||||||
|
@Input(fullName="samples",doc="A file of sample IDs to condense VCF file to",required=false)
|
||||||
|
var extractSamples : File = _
|
||||||
|
|
||||||
|
val tmpdir : File = System.getProperty("java.io.tmpdir")
|
||||||
|
|
||||||
|
def script = {
|
||||||
|
if ( intervals == null ) {
|
||||||
|
// create an interval list from the VCF
|
||||||
|
val ivals : File = swapExt(variants,".vcf",".intervals.list")
|
||||||
|
val extract : VCFExtractIntervals = new VCFExtractIntervals(variants,ivals,false)
|
||||||
|
add(extract)
|
||||||
|
} else {
|
||||||
|
var chunkNum = 1
|
||||||
|
var numLinesInChunk = 0
|
||||||
|
var chromosome : String = asScalaIterator(new XReadLines(intervals)).next().split(":")(0)
|
||||||
|
var chunkFile : File = new File(tmpdir,"ChunkVCF.chunk%d.intervals.list".format(chunkNum))
|
||||||
|
var chunkWriter = new PrintStream(chunkFile)
|
||||||
|
asScalaIterator(new XReadLines(intervals)).foreach( int => {
|
||||||
|
// check new chromosome or full chunk
|
||||||
|
if ( ( preserve && ! int.split(":")(0).equals(chromosome) ) || numLinesInChunk > numEntries ) {
|
||||||
|
chunkWriter.close()
|
||||||
|
val chunkSelect : SelectVariants = new SelectVariants
|
||||||
|
chunkSelect.reference_sequence = ref
|
||||||
|
chunkSelect.memoryLimit = 2
|
||||||
|
chunkSelect.intervals :+= chunkFile
|
||||||
|
if ( extractSamples != null )
|
||||||
|
chunkSelect.sample_file = extractSamples
|
||||||
|
chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum))
|
||||||
|
add(chunkSelect)
|
||||||
|
chunkNum += 1
|
||||||
|
numLinesInChunk = 0
|
||||||
|
chromosome = int.split(":")(0)
|
||||||
|
chunkFile = new File(tmpdir,"ChunkVCF.chunk%d.intervals.list".format(chunkNum))
|
||||||
|
chunkWriter = new PrintStream(chunkFile)
|
||||||
|
}
|
||||||
|
chunkWriter.printf("%s%n",int)
|
||||||
|
numLinesInChunk += 1
|
||||||
|
})
|
||||||
|
// last chunk
|
||||||
|
if ( numLinesInChunk > 0 ) {
|
||||||
|
// some work to do
|
||||||
|
val chunkSelect : SelectVariants = new SelectVariants
|
||||||
|
chunkSelect.reference_sequence = ref
|
||||||
|
chunkSelect.memoryLimit = 2
|
||||||
|
chunkSelect.intervals :+= chunkFile
|
||||||
|
chunkWriter.close()
|
||||||
|
if ( extractSamples != null )
|
||||||
|
chunkSelect.sample_file = extractSamples
|
||||||
|
chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum))
|
||||||
|
add(chunkSelect)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -6,7 +6,7 @@ import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals
|
||||||
import org.broadinstitute.sting.utils.text.XReadLines
|
import org.broadinstitute.sting.utils.text.XReadLines
|
||||||
import collection.JavaConversions._
|
import collection.JavaConversions._
|
||||||
import java.io._
|
import java.io._
|
||||||
import org.broadinstitute.sting.queue.extensions.gatk.VariantsToPed
|
import org.broadinstitute.sting.queue.extensions.gatk.{SelectVariants, VariantsToPed}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by IntelliJ IDEA.
|
* Created by IntelliJ IDEA.
|
||||||
|
|
@ -31,11 +31,14 @@ class VcfToPed extends QScript {
|
||||||
var intervals : File = _
|
var intervals : File = _
|
||||||
|
|
||||||
@Argument(shortName="R",fullName="Ref",required=false,doc="Reference file")
|
@Argument(shortName="R",fullName="Ref",required=false,doc="Reference file")
|
||||||
var ref : File = new File("/humgen/1kg/references/human_g1k_v37.fasta")
|
var ref : File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
|
||||||
|
|
||||||
@Argument(shortName="D",fullName="dbsnp",required=false,doc="dbsnp file")
|
@Argument(shortName="D",fullName="dbsnp",required=false,doc="dbsnp file")
|
||||||
var dbsnp : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf")
|
var dbsnp : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf")
|
||||||
|
|
||||||
|
@Argument(shortName="sf",fullName="sampleFile",required=false,doc="sample file")
|
||||||
|
var samFile : File = _
|
||||||
|
|
||||||
val tmpdir : File = System.getProperty("java.io.tmpdir")
|
val tmpdir : File = System.getProperty("java.io.tmpdir")
|
||||||
|
|
||||||
def script = {
|
def script = {
|
||||||
|
|
@ -59,14 +62,27 @@ class VcfToPed extends QScript {
|
||||||
val toPed : VariantsToPed = new VariantsToPed
|
val toPed : VariantsToPed = new VariantsToPed
|
||||||
toPed.memoryLimit = 2
|
toPed.memoryLimit = 2
|
||||||
toPed.reference_sequence = ref
|
toPed.reference_sequence = ref
|
||||||
toPed.intervals :+= new File(subListFile)
|
toPed.intervals :+= subListFile
|
||||||
toPed.dbsnp = dbsnp
|
toPed.dbsnp = dbsnp
|
||||||
toPed.variant = variants
|
if ( samFile != null ) {
|
||||||
|
val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk)
|
||||||
|
val extract : SelectVariants = new SelectVariants
|
||||||
|
extract.reference_sequence = ref
|
||||||
|
extract.memoryLimit = 2
|
||||||
|
extract.intervals :+= subListFile
|
||||||
|
extract.variant = variants
|
||||||
|
extract.out = new File(tmpdir,base+"_extract%d.vcf".format(chunk))
|
||||||
|
extract.sample_file :+= samFile
|
||||||
|
add(extract)
|
||||||
|
toPed.variant = extract.out
|
||||||
|
} else {
|
||||||
|
toPed.variant = variants
|
||||||
|
}
|
||||||
toPed.metaData = meta
|
toPed.metaData = meta
|
||||||
lazy val base : String = bed.getName.stripSuffix(".bed")+"_%".format(chunk)
|
val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk)
|
||||||
lazy val tBed = new File(tmpdir,base+".bed")
|
val tBed = new File(tmpdir,base+".bed")
|
||||||
lazy val bim = new File(tmpdir,base+".bim")
|
val bim = new File(tmpdir,base+".bim")
|
||||||
lazy val fam = new File(tmpdir,base+".fam")
|
val fam = new File(tmpdir,base+".fam")
|
||||||
toPed.bed = tBed
|
toPed.bed = tBed
|
||||||
toPed.bim = bim
|
toPed.bim = bim
|
||||||
toPed.fam = fam
|
toPed.fam = fam
|
||||||
|
|
@ -87,12 +103,26 @@ class VcfToPed extends QScript {
|
||||||
toPed.reference_sequence = ref
|
toPed.reference_sequence = ref
|
||||||
toPed.intervals :+= new File(subListFile)
|
toPed.intervals :+= new File(subListFile)
|
||||||
toPed.dbsnp = dbsnp
|
toPed.dbsnp = dbsnp
|
||||||
toPed.variant = variants
|
if ( samFile != null ) {
|
||||||
|
val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk)
|
||||||
|
val extract : SelectVariants = new SelectVariants
|
||||||
|
extract.reference_sequence = ref
|
||||||
|
extract.memoryLimit = 2
|
||||||
|
extract.intervals :+= subListFile
|
||||||
|
extract.variant = variants
|
||||||
|
extract.out = new File(tmpdir,base+"_extract%d.vcf".format(chunk))
|
||||||
|
extract.sample_file :+= samFile
|
||||||
|
add(extract)
|
||||||
|
toPed.variant = extract.out
|
||||||
|
} else {
|
||||||
|
toPed.variant = variants
|
||||||
|
}
|
||||||
toPed.metaData = meta
|
toPed.metaData = meta
|
||||||
lazy val base : String = bed.getName.stripSuffix(".bed")+"_%".format(chunk)
|
toPed.memoryLimit = 2
|
||||||
lazy val tBed = new File(tmpdir,base+".bed")
|
val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk)
|
||||||
lazy val bim = new File(tmpdir,base+".bim")
|
val tBed = new File(tmpdir,base+".bed")
|
||||||
lazy val fam = new File(tmpdir,base+".fam")
|
val bim = new File(tmpdir,base+".bim")
|
||||||
|
val fam = new File(tmpdir,base+".fam")
|
||||||
toPed.bed = tBed
|
toPed.bed = tBed
|
||||||
toPed.bim = bim
|
toPed.bim = bim
|
||||||
toPed.fam = fam
|
toPed.fam = fam
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue