Mostly small changes to my own scala scripts: .vcf.gz compatibility for output files, smarter beagle generation, simple script to scatter-gather combine variants. Whole genome indel calling now uses the gold standard indel set.

This commit is contained in:
Christopher Hartl 2012-02-22 17:20:04 -05:00
parent 9b61a398b3
commit 2c1b14d35e
2 changed files with 19 additions and 5 deletions

View File

@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -45,6 +46,9 @@ public class VariantsToPed extends RodWalker<Integer,Integer> {
@Output(shortName="fam",fullName="fam",required=true,doc="output fam file") @Output(shortName="fam",fullName="fam",required=true,doc="output fam file")
PrintStream outFam; PrintStream outFam;
@Argument(shortName="mgq",fullName="minGenotypeQuality",required=true,doc="If genotype quality is lower than this value, output NO_CALL")
int minGenotypeQuality = 0;
private ValidateVariants vv = new ValidateVariants(); private ValidateVariants vv = new ValidateVariants();
private static double APPROX_CM_PER_BP = 1000000.0/750000.0; private static double APPROX_CM_PER_BP = 1000000.0/750000.0;
@ -173,9 +177,11 @@ public class VariantsToPed extends RodWalker<Integer,Integer> {
return 0; return 0;
} }
private static byte getEncoding(Genotype g, int offset) { private byte getEncoding(Genotype g, int offset) {
byte b; byte b;
if ( g.isHomRef() ) { if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) {
b = NO_CALL;
} else if ( g.isHomRef() ) {
b = HOM_REF; b = HOM_REF;
} else if ( g.isHomVar() ) { } else if ( g.isHomVar() ) {
b = HOM_VAR; b = HOM_VAR;

View File

@ -47,9 +47,14 @@ class VcfToPed extends QScript {
val extract : VCFExtractIntervals = new VCFExtractIntervals(variants,ivals,false) val extract : VCFExtractIntervals = new VCFExtractIntervals(variants,ivals,false)
add(extract) add(extract)
} else { } else {
val IS_GZ : Boolean = variants.getName.endsWith(".vcf.gz")
var iXRL = new XReadLines(intervals) var iXRL = new XReadLines(intervals)
var chunk = 1; var chunk = 1;
var subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk)) var subListFile : File = null
if ( IS_GZ )
subListFile = swapExt(tmpdir,variants,".vcf.gz",".chunk%d.list".format(chunk))
else
subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk))
var subList = new PrintStream(subListFile) var subList = new PrintStream(subListFile)
var nL = 0; var nL = 0;
var bedOuts : List[File] = Nil; var bedOuts : List[File] = Nil;
@ -58,7 +63,7 @@ class VcfToPed extends QScript {
while ( iXRL.hasNext ) { while ( iXRL.hasNext ) {
subList.printf("%s%n",iXRL.next()) subList.printf("%s%n",iXRL.next())
nL = nL + 1 nL = nL + 1
if ( nL > 100000 ) { if ( nL > 10000 ) {
val toPed : VariantsToPed = new VariantsToPed val toPed : VariantsToPed = new VariantsToPed
toPed.memoryLimit = 2 toPed.memoryLimit = 2
toPed.reference_sequence = ref toPed.reference_sequence = ref
@ -89,6 +94,9 @@ class VcfToPed extends QScript {
add(toPed) add(toPed)
subList.close() subList.close()
chunk = chunk + 1 chunk = chunk + 1
if ( IS_GZ )
subListFile = swapExt(tmpdir,variants,".vcf.gz",".chunk%d.list".format(chunk))
else
subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk)) subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk))
subList = new PrintStream(subListFile) subList = new PrintStream(subListFile)
bedOuts :+= tBed bedOuts :+= tBed