Somewhat major update. Changes:

- ProduceBeagleInputWalker
 + Now takes a validation ROD and a prior to assign to it; it will use the validation genotypes in place of the variant genotypes if both are present
 + Takes a bootstrap argument -- can use a given percentage of the validation sites
 + Optionally takes a bootstrap output argument -- re-prints the validation VCF, filtering those sites used as part of the bootstrap
- BeagleOutputToVCFWalker
 + Now filters out sites where the genotypes have been reverted to hom-ref
 + Now calls into the new VariantContextUtils to calculate AC/AN

- Queue
 + New pipeline libraries for easy QScript creation; still a work in progress, but this is a substantial prototype
 + full calling pipeline v2 uses the above libraries
 + minor changes to some of my own scripts
 + no more need for contig interval lists; these will be parsed out of your normal interval list when it is provided



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4459 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-10-08 13:30:28 +00:00
parent e02f837659
commit 21ec44339d
12 changed files with 1053 additions and 131 deletions

View File

@ -32,6 +32,7 @@ import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.features.beagle.BeagleFeature;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -161,6 +162,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
int numGenotypesChangedByBeagle = 0;
Integer alleleCountH = 0, chrCountH = 0;
Double alleleFrequencyH = 0.0;
int beagleVarCounts = 0;
Map<String,Genotype> hapmapGenotypes = null;
@ -284,31 +286,26 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
originalAttributes.put("OG",".");
}
Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased);
if ( imputedGenotype.isHet() || imputedGenotype.isHomVar() ) {
beagleVarCounts++;
}
genotypes.put(originalGenotypes.getKey(), imputedGenotype);
}
VariantContext filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes());
Set<Allele> altAlleles = filteredVC.getAlternateAlleles();
StringBuffer altAlleleCountString = new StringBuffer();
for ( Allele allele : altAlleles ) {
if ( altAlleleCountString.length() > 0 )
altAlleleCountString.append(",");
altAlleleCountString.append(filteredVC.getChromosomeCount(allele));
VariantContext filteredVC;
if ( beagleVarCounts > 0)
filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes());
else {
Set<String> removedFilters = vc_input.filtersWereApplied() ? new HashSet<String>(vc_input.getFilters()) : new HashSet<String>(1);
removedFilters.add(String.format("BGL_RM_WAS_%s/%s",vc_input.getReference().getBaseString(),vc_input.getAlternateAllele(0)));
filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), new HashSet<Allele>(Arrays.asList(vc_input.getReference())), genotypes, vc_input.getNegLog10PError(), removedFilters, vc_input.getAttributes());
}
HashMap<String, Object> attributes = new HashMap<String, Object>(filteredVC.getAttributes());
if ( filteredVC.getChromosomeCount() > 0 ) {
attributes.put(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", filteredVC.getChromosomeCount()));
if ( altAlleleCountString.length() > 0 ) {
attributes.put(VCFConstants.ALLELE_COUNT_KEY, altAlleleCountString.toString());
attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, String.format("%4.2f",
Double.valueOf(altAlleleCountString.toString())/(filteredVC.getChromosomeCount())));
}
}
// re-compute chromosome counts
VariantContextUtils.calculateChromosomeCounts(filteredVC, attributes, false);
// Get Hapmap AC and AF
if (vc_comp != null) {
@ -322,8 +319,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
attributes.put("R2", beagleR2Feature.getR2value().toString() );
vcfWriter.add(VariantContext.modifyAttributes(filteredVC, attributes), ref.getBase());
vcfWriter.add(VariantContext.modifyAttributes(filteredVC,attributes), ref.getBase());
return 1;

View File

@ -28,13 +28,14 @@ package org.broadinstitute.sting.gatk.walkers.beagle;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFConstants;
import org.broad.tribble.vcf.VCFWriter;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.RMD;
@ -42,6 +43,8 @@ import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.io.PrintStream;
import java.util.*;
@ -53,6 +56,7 @@ import java.util.*;
public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
public static final String ROD_NAME = "variant";
public static final String VALIDATION_ROD_NAME = "validation";
@Output(doc="File to which BEAGLE input should be written",required=true)
protected PrintStream beagleWriter = null;
@ -61,9 +65,24 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
public PrintStream beagleGenotypesWriter = null;
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
public double insertedNoCallRate = 0;
@Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
public double validationPrior = -1.0;
@Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false)
public double bootstrap = 0.0;
@Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false)
VCFWriter bootstrapVCFOutput = null;
@Hidden
@Argument(fullName = "variant_genotype_ptrue", shortName = "varp", doc = "Flat probability prior to assign to variant (not validation) genotypes. Does not override GL field.", required = false)
public double variantPrior = 0.96;
private Set<String> samples = null;
private Set<String> BOOTSTRAP_FILTER = new HashSet<String>( Arrays.asList("bootstrap") );
private Random generator;
private int bootstrapCount = -1;
private int bootstrapSetSize = 0;
private int testSetSize = 0;
public void initialize() {
generator = new Random();
@ -77,114 +96,176 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
beagleWriter.println();
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.println("dummy header");
if ( bootstrapVCFOutput != null ) {
initializeVcfWriter();
}
}
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
if( tracker != null ) {
if( tracker != null ) {
GenomeLoc loc = context.getLocation();
VariantContext vc_eval;
VariantContext variant_eval;
VariantContext validation_eval;
vc_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, false);
if ( vc_eval == null || vc_eval.isFiltered() )
variant_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, false);
validation_eval = tracker.getVariantContext(ref,VALIDATION_ROD_NAME,null,loc,false);
if ( goodSite(variant_eval,validation_eval) ) {
if ( useValidation(variant_eval,validation_eval, ref) ) {
writeBeagleOutput(validation_eval,variant_eval,true,validationPrior);
} else {
if ( goodSite(variant_eval) ) {
writeBeagleOutput(variant_eval,validation_eval,false,variantPrior);
return 1;
} else { // todo -- if the variant site is bad, validation is good, but not in bootstrap set -- what do?
return 0;
}
}
return 1;
} else {
return 0;
if (!vc_eval.isSNP())
return 0;
if (!vc_eval.isBiallelic())
return 0;
// output marker ID to Beagle input file
beagleWriter.print(String.format("%s ", VariantContextUtils.getLocation(vc_eval).toString()));
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.print(String.format("%s ", VariantContextUtils.getLocation(vc_eval).toString()));
for (Allele allele: vc_eval.getAlleles()) {
// TODO -- check whether this is really needed by Beagle
String bglPrintString;
if (allele.isNoCall() || allele.isNull())
bglPrintString = "0";
else
bglPrintString = allele.toString().substring(0,1); // get rid of * in case of reference allele
beagleWriter.print(String.format("%s ", bglPrintString));
}
if ( !vc_eval.hasGenotypes() )
return 0;
Map<String, Genotype> genotypes = vc_eval.getGenotypes();
for ( String sample : samples ) {
// use sample as key into genotypes structure
Genotype genotype = genotypes.get(sample);
if (genotype.isCalled() && genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
String[] glArray = genotype.getAttributeAsString(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).split(",");
double[] likeArray = new double[glArray.length];
// convert to double array so we can normalize
int k=0;
for (String gl : glArray) {
likeArray[k++] = Double.valueOf(gl);
}
double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(likeArray);
// see if we need to randomly mask out genotype in this position.
Double d = generator.nextDouble();
if (d > insertedNoCallRate ) {
// System.out.format("%5.4f ", d);
for (Double likeVal: normalizedLikelihoods)
beagleWriter.print(String.format("%5.4f ",likeVal));
}
else {
// we are masking out this genotype
beagleWriter.print("0.33 0.33 0.33 ");
}
if (beagleGenotypesWriter != null) {
char a = genotype.getAllele(0).toString().charAt(0);
char b = genotype.getAllele(0).toString().charAt(0);
beagleGenotypesWriter.format("%c %c ", a, b);
}
}
else if (genotype.isCalled() && !genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
// hack to deal with input VCFs with no genotype likelihoods. Just assume the called genotype
// is confident. This is useful for Hapmap and 1KG release VCFs.
double AA = 0.02;
double AB = 0.02;
double BB = 0.02;
if (genotype.isHomRef()) { AA = 0.96; }
else if (genotype.isHet()) { AB = 0.96; }
else if (genotype.isHomVar()) { BB = 0.96; }
beagleWriter.printf("%.2f %.2f %.2f ", AA, AB, BB);
if (beagleGenotypesWriter != null) {
char a = genotype.getAllele(0).toString().charAt(0);
char b = genotype.getAllele(0).toString().charAt(0);
beagleGenotypesWriter.format("%c %c ", a, b);
}
}
else {
beagleWriter.print("0.33 0.33 0.33 "); // write 1/3 likelihoods for uncalled genotypes.
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.print(". . ");
}
}
beagleWriter.println();
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.println();
} else {
return 0;
}
return 1;
}
public boolean goodSite(VariantContext a, VariantContext b) {
return goodSite(a) || goodSite(b);
}
public boolean goodSite(VariantContext v) {
return v != null && ! v.isFiltered() && v.isSNP() && v.isBiallelic() && v.hasGenotypes();
}
public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) {
if( goodSite(validation) ) {
// if using record keeps us below expected proportion, use it
logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1));
if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) {
if ( bootstrapVCFOutput != null ) {
bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER), ref.getBase() );
}
bootstrapSetSize ++;
return true;
} else {
if ( bootstrapVCFOutput != null ) {
bootstrapVCFOutput.add(validation,ref.getBase());
}
testSetSize++;
return false;
}
} else {
if ( validation != null && bootstrapVCFOutput != null ) {
bootstrapVCFOutput.add(validation,ref.getBase());
}
return false;
}
}
public void writeBeagleOutput(VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
beagleWriter.print(String.format("%s ",VariantContextUtils.getLocation(preferredVC).toString()));
if ( beagleGenotypesWriter != null ) {
beagleGenotypesWriter.print(String.format("%s ",VariantContextUtils.getLocation(preferredVC).toString()));
}
for ( Allele allele : preferredVC.getAlleles() ) {
String bglPrintString;
if (allele.isNoCall() || allele.isNull())
bglPrintString = "0";
else
bglPrintString = allele.toString().substring(0,1); // get rid of * in case of reference allele
beagleWriter.print(String.format("%s ", bglPrintString));
}
Map<String,Genotype> preferredGenotypes = preferredVC.getGenotypes();
Map<String,Genotype> otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
boolean isValidation;
for ( String sample : samples ) {
Genotype genotype;
// use sample as key into genotypes structure
if ( preferredGenotypes.keySet().contains(sample) ) {
genotype = preferredGenotypes.get(sample);
isValidation = isValidationSite;
} else if ( otherGenotypes != null && otherGenotypes.keySet().contains(sample) ) {
genotype = otherGenotypes.get(sample);
isValidation = ! isValidationSite;
} else {
// there is magically no genotype for this sample.
throw new StingException("Sample "+sample+" arose with no genotype in variant or validation VCF. This should never happen.");
}
/**
* Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key
*/
if ( (isValidation && prior < 0.0) || genotype.isCalled() && genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
String[] glArray = genotype.getAttributeAsString(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).split(",");
double[] likeArray = new double[glArray.length];
// convert to double array so we can normalize
int k=0;
for (String gl : glArray) {
likeArray[k++] = Double.valueOf(gl);
}
double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(likeArray);
// see if we need to randomly mask out genotype in this position.
Double d = generator.nextDouble();
if (d > insertedNoCallRate ) {
// System.out.format("%5.4f ", d);
for (Double likeVal: normalizedLikelihoods)
beagleWriter.print(String.format("%5.4f ",likeVal));
}
else {
// we are masking out this genotype
beagleWriter.print("0.33 0.33 0.33 ");
}
if (beagleGenotypesWriter != null) {
char a = genotype.getAllele(0).toString().charAt(0);
char b = genotype.getAllele(0).toString().charAt(0);
beagleGenotypesWriter.format("%c %c ", a, b);
}
}
/**
* otherwise, use the prior uniformly
*/
else if (! isValidation && genotype.isCalled() && !genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
// hack to deal with input VCFs with no genotype likelihoods. Just assume the called genotype
// is confident. This is useful for Hapmap and 1KG release VCFs.
double AA = (1.0-prior)/2.0;
double AB = (1.0-prior)/2.0;
double BB = (1.0-prior)/2.0;
if (genotype.isHomRef()) { AA = prior; }
else if (genotype.isHet()) { AB = prior; }
else if (genotype.isHomVar()) { BB = prior; }
beagleWriter.printf("%.2f %.2f %.2f ", AA, AB, BB);
if (beagleGenotypesWriter != null) {
char a = genotype.getAllele(0).toString().charAt(0);
char b = genotype.getAllele(0).toString().charAt(0);
beagleGenotypesWriter.format("%c %c ", a, b);
}
}
else {
beagleWriter.print("0.33 0.33 0.33 "); // write 1/3 likelihoods for uncalled genotypes.
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.print(". . ");
}
}
beagleWriter.println();
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.println();
}
public Integer reduceInit() {
return 0; // Nothing to do here
}
@ -195,6 +276,19 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
public void onTraversalDone( Integer sum ) {
}
}
private void initializeVcfWriter() {
final ArrayList<String> inputNames = new ArrayList<String>();
inputNames.add( VALIDATION_ROD_NAME );
// setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
hInfo.add(new VCFFilterHeaderLine("bootstrap","This site used for genotype bootstrapping with ProduceBeagleInputWalker"));
bootstrapVCFOutput.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
}
}

View File

@ -54,4 +54,27 @@ public class BeagleIntegrationTest extends WalkerTest {
executeTest("test BeagleInput", spec);
}
@Test
public void testBeagleInput2() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T ProduceBeagleInput -B:variant,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_HSQ_chr22_14-16m.vcf "+
"-B:validation,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_OMNI_chr22_14-16m.vcf "+
"-L 22:14000000-16000000 -o %s -bvcf %s -bs 0.8 -valp 0.98 -R /humgen/1kg/reference/human_g1k_v37.fasta -NO_HEADER ",2,
Arrays.asList("44d28b6b092d5f4c0ae59af442612ea3","481c58f8309916184a33ab1835e5cc48"));
executeTest("test BeagleInputWithBootstrap",spec);
}
@Test
public void testBeagleOutput2() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T BeagleOutputToVCF -R "+hg19Reference+" "+
"-B:variant,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.vcf "+
"-B:beagleR2,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+
"-B:beagleProbs,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+
"-B:beaglePhased,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+
"-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("fd104b7da9e6c99b662c6ad746fd53f3"));
executeTest("testBeagleChangesSitesToRef",spec);
}
}

View File

@ -0,0 +1,122 @@
import com.sun.tools.doclets.internal.toolkit.util.DocFinder.Input
import org.broadinstitute.sting.gatk.{CommandLineGATK, DownsampleType}
import java.io.File
import net.sf.picard.reference.FastaSequenceFile
import org.broadinstitute.sting.commandline.Argument
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.extensions.samtools._
import org.broadinstitute.sting.queue.pipeline.{BamProcessing,VariantCalling}
import org.broadinstitute.sting.queue.{QException, QScript}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
class fullCallingPipelineV2 extends QScript {
qscript =>
@Argument(doc="Number of cleaning jobs", shortName="cleaningJobs")
var cleaningJobs: Int = _
@Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y")
var yamlFile: File = _
@Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false)
var trigger: File = _
@Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable")
var refseqTable: File = _
@Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", required=false)
var picardFixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")
@Input(doc="path to GATK jar")
var gatkJar: File = _
@Input(doc="target Ti/Tv ratio for recalibration", shortName="titv", required=true)
var target_titv: Float = _
@Input(doc="per-sample downsampling level",shortName="dcov",required=false)
var downsampling_coverage = 300
@Input(doc="level of parallelism for UnifiedGenotyper", shortName="snpScatter", required=false)
var num_snp_scatter_jobs = 20
@Input(doc="level of parallelism for IndelGenotyperV2", shortName="indelScatter", required=false)
var num_indel_scatter_jobs = 5
@Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false)
var skip_cleaning = false
//@Input(doc="ADPR script")
//var adprScript: File = _
//@Input(doc="Sequencing maching name (for use by adpr)")
//var machine: String = _
//@Input(doc="Sequencing experiement type (for use by adpr)--Whole_Exome, Whole_Genome, or Hybrid_Selection")
//var protocol: String = _
private var pipeline: Pipeline = _
trait CommandLineGATKArgs extends CommandLineGATK {
this.intervals = qscript.pipeline.getProject.getIntervalList
this.jarFile = qscript.gatkJar
this.reference_sequence = qscript.pipeline.getProject.getReferenceFile
this.memoryLimit = Some(4)
}
// ------------ SETUP THE PIPELINE ----------- //
def script = {
pipeline = YamlUtils.load(classOf[Pipeline], qscript.yamlFile)
var callingLib: VariantCalling = new VariantCalling(qscript.yamlFile,qscript.gatkJar)
var cleaningLib: BamProcessing = new BamProcessing(qscript.yamlFile,qscript.gatkJar,qscript.picardFixMatesJar)
val projectBase: String = qscript.pipeline.getProject.getName
val cleanedBase: String = projectBase + ".cleaned"
val uncleanedBase: String = projectBase + ".uncleaned"
// there are commands that use all the bam files
val recalibratedSamples = qscript.pipeline.getSamples.filter(_.getBamFiles.contains("recalibrated"))
var bamsToClean: List[(File,File)] = Nil
var recalBams: List[File] = Nil
var cleanedBams: List[File] = Nil
for ( sample <- recalibratedSamples ) {
val bam = sample.getBamFiles.get("recalibrated")
recalBams :+= bam
if (!sample.getBamFiles.contains("cleaned")) {
sample.getBamFiles.put("cleaned", swapExt(bam,"bam","cleaned.bam"))
bamsToClean :+= (bam,sample.getBamFiles.get("cleaned"))
}
cleanedBams :+= sample.getBamFiles.get("cleaned")
}
if ( !qscript.skip_cleaning ) {
cleaningLib.StandardIndelRealign(bamsToClean,qscript.cleaningJobs)
}
if (!qscript.skip_cleaning) {
endToEnd(cleanedBase, cleanedBams, lib)
} else {
endToEnd(uncleanedBase, recalBams, lib)
}
}
def endToEnd(base: String, bamFiles: List[File], lib: VariantCalling) = {
var recal_vcf = base+"_snps.recal.annotated.tranched.vcf"
var handfilt_vcf = base+"_snps.handfiltered.annotated.vcf"
var indel_vcf = base+"_indel_calls.vcf"
for ( c <- lib.StandardCallingPipeline(bamFiles,indel_vcf,recal_vcf,handfilt_vcf,qscript.target_titv,qscript.refseqTable) ) {
add(c)
}
}
}

View File

@ -0,0 +1,124 @@
import java.io.{FileReader, File, BufferedReader}
import net.sf.picard.reference.FastaSequenceFile
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.DownsampleType
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.extensions.samtools._
import org.broadinstitute.sting.queue.{QException, QScript}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
class omni_bootstrap_refine extends QScript {
=> script
val ref = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
val gatkJar = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
trait FooTrait extends CommandLineGATK {
this.reference_sequence = script.ref
this.jarFile = script.gatkJar
this.intervalsString :+= "20"
}
class SampleOverlap extends CommandLineFunction {
@Input(doc="omni vcf") var omni: File = _
@Input(doc="other vcf") var other: File = _
@Output(doc="output sn file") var out : File = _
def commandLine = "%s ; %s ; %s ; %s".format(this.cmd1, this.cmd2, this.cmd3, this.cmd4)
def cmd1 : String = {
return "head -n 500 %s | grep #CHR | cut -f10- | tr '\t' '\n' | sort > temp1".format(omni.getAbsolutePath)
}
def cmd2 : String = {
return "head -n 500 %s | grep #CHR | cut -f10- | tr '\t' '\n'| sort > temp2".format(other.getAbsolutePath)
}
def cmd3 : String = {
return "cat temp1 temp2 | sort | uniq -c | awk '{if($1 == 2) print $2}' > %s".format(out.getAbsolutePath)
}
def cmd4 : String = {
return "rm temp1 temp2"
}
}
class BeagleRefinement extends CommandLineFunction {
@Input(doc="The beagle input file") var beagleInput: File = _
var beagleOutputBase: String = _
var beagleMemoryGigs: Int = 4
/**
* Note: These get set
*/
@Output(doc="The beagle phased file") var beaglePhasedFile: File = _
@Output(doc="The beagle likelihood file") var beagleLikelihoods: File = _
@Output(doc="The beagle r2 file") var beagleRSquared: File = _
var beagleOutputDir: String = _
def freezeOutputs = {
if ( beagleInput.getParent == null ) {
beagleOutputDir = ""
} else {
beagleOutputDir = beagleInput.getParent
}
beaglePhasedFile = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".phased.gz")
beagleLikelihoods = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".gprobs.gz")
beagleRSquared = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".r2")
}
def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s like=%s out=%s".format(beagleInput.getParent,beagleMemoryGigs,beagleJar,beagleInput.getAbsolutePath,beagleOutputBase)
}
def runme(pop: File, omni: File, base: String) : List[CommandLineFunction] = {
var clf : List[CommandLineFunction] = Nil
val s_overlap = new File(base+"_sample_overlap.txt")
var calcOverlap = new SampleOverlap
calcOverlap.omni = omni
calcOverlap.other = pop
calcOverlap.out = s_overlap
clf += calcOverlap
val subset_omni = swapExt(omni,".vcf","_%s_subset.vcf".format(base))
val subset_pop = swapExt(pop,".vcf","_%s_subset.vcf".format(base))
var omSubset = new SelectVariants with FooTrait
omSubset.variantVCF = omni
omSubset.sample = s_overlap.getAbsolutePath
omSubset.out = subset_omni
clf += omSubset
var popSubset = new SelectVariants with FooTrait
popSubset.variantVCF = pop
popSubset.sample = s_overlap.getAbsolutePath
popSubset.out = subset_pop
clf += popSubset
var bootRefine = new ProduceBeagleInput with FooTrait
bootRefine.variantVCF = popSubset
bootRefine.rodBind :+= new RodBind("validation","VCF",subset_omni)
bootRefine.bootstrap_vcf = swapExt(subset_omni,".vcf",".boot.vcf")
bootRefine.bootstrap = Some(2)
bootRefine.validation_genotype_ptrue = Some(0.98);
bootRefine.out = new File(base+".beagle_input.beagle")
clf += bootRefine
var runBeagle = new BeagleRefinement
runBeagle.beagleInput = bootRefine.out
runBeagle.beagleOutputBase = base+"_beagle"
runBeagle.freezeOutputs()
clf += runBeagle
var putBack = new BeagleOutputToVCF with FooTrait
putBack.
}
}

View File

@ -14,22 +14,28 @@ import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
class omni_qc extends QScript {
qscript =>
var pilot3_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/pilot3_overlap.txt")
var pilot1_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/pilot1_overlap.txt")
var production_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/production_overlap.txt")
var hiseq_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878_name.txt")
// NON-OMNI VCF FILES
var pilot3_release_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/pilot3/merge_release/ALL.exon.2010_03.genotypes.vcf","vcf")
var pilot1_ceu_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CEU.low_coverage.2010_07.genotypes.vcf.gz","vcf")
var pilot1_chb_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CHBJPT.low_coverage.2010_07.genotypes.vcf.gz","vcf")
var pilot1_yri_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/YRI.low_coverage.2010_07.genotypes.vcf.gz","vcf")
var august_calls_vcf = null
var hiseq_calls_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878.HiSeq.v9.b36.vcf.gz","vcf")
var pilot1_with_na12878_vcf = new TaggedFile("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/calls/v2/N60/lowpass.N60.recal.mG6.retranche.vcf","vcf")
// OMNI VCF FILES
var OMNI_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b36.vcf","vcf")
var OMNI_b37_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b37.deduped.vcf.gz","vcf")
var OMNI_hapmap_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_pilot_b36.vcf.gz","vcf")
// INTERVALS
var pilot3_interval_list: String = "/humgen/gsa-hpprojects/1kg/1kg_pilot3/documents/CenterSpecificTargetLists/results/p3overlap.targets.b36.interval_list"
var pilot1_interval_list: String = _
var hiseq_interval_list: String = _
var production_interval_list: String = "20"
val Omni_chip_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/Omni_2_5_603samples.vcf.gz","vcf")
val Omni_b37_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/Omni_2_5_603samples.b37.vcf.gz","vcf")
val pilot3_vcf_broad = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/ALL.exon.2010_05.broad.genotypes.vcf.gz","vcf")
val pilot1_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/CEU.low_coverage.2010_07.genotypes.vcf.gz","vcf")
val production_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/ALL.production.2010_08.chr20.lowpass.genotypes.vcf","vcf")
val hiSeq_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878.HiSeq.v9.b36.vcf.gz","vcf")
// OTHER
var failed_qc_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/failed_qc_samples.txt")
trait OmniArgs extends CommandLineGATK {
this.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta")
@ -80,6 +86,10 @@ class omni_qc extends QScript {
eval.reportLocation = new TaggedFile(outDir+base+".subset.omni.eval","eval")
eval.reportType = Some(VE2TemplateType.R)
eval.analysisName = base+"_Eval"
eval.select ++= List("AC==1","AC==2","AC==3","AC==4","AC==5","AC>5")
eval.selectName ++= List("AC1","AC2","AC3","AC4","AC5","AC6+")
eval.disI = true
eval.outputVCF = outDir+base+".eval.interesting.sites.vcf"
if ( base.equalsIgnoreCase("production") ) {
subset.variantVCF = Omni_b37_vcf
@ -88,6 +98,8 @@ class omni_qc extends QScript {
eval.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod")
}
add(subset,makeOmniSiteList,eval)
}
@ -96,5 +108,6 @@ class omni_qc extends QScript {
runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf_broad,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3")
runme(hiseq_overlap_samples,hiseq_interval_list,hiSeq_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/hiSeq/","NA12878_HiSeq")
runme(production_overlap_samples,production_interval_list,production_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/production/","production")
runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3_release")
}
}
}

View File

@ -38,4 +38,5 @@ trait QScript extends Logging {
* Adds one or more command line functions to be run.
*/
def add(functions: CommandLineFunction*) = this.functions ++= List(functions:_*)
}

View File

@ -12,6 +12,9 @@ import org.broadinstitute.sting.queue.function.scattergather.{SimpleTextGatherFu
trait CommandLineFunction extends QFunction with Logging {
def commandLine: String
/* Is it a gather function? */
var isGather: Boolean = true
/** Upper memory limit */
var memoryLimit: Option[Int] = None

View File

@ -0,0 +1,138 @@
package org.broadinstitute.sting.queue.pipeline
import org.broadinstitute.sting.queue.extensions.gatk._
import java.io.File
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import net.sf.picard.reference.ReferenceSequenceFileFactory
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
import org.broadinstitute.sting.utils.interval.IntervalUtils
import collection.mutable.{ListBuffer, HashMap}
import collection.JavaConversions
import java.util.Arrays
import org.broadinstitute.sting.queue.util.{PipelineUtils, IOUtils}
import org.broadinstitute.sting.commandline.{Output, Input}
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
/**
 * Library of standard bam-processing command line functions (indel realignment,
 * per-contig cleaning, mate fixing), configured from a pipeline yaml file.
 *
 * NOTE(review): swapExt below strips the directory component (File.getName), so
 * every derived output file is created relative to the run directory -- confirm
 * this matches the Queue convention the callers expect.
 */
class BamProcessing(yaml: File, gatkJar: File, fixMatesJar: File) {
  library =>

  // Project-wide attributes (reference, interval list, dbSNP) loaded from the yaml.
  var attributes: Pipeline = YamlUtils.load(classOf[Pipeline],yaml)

  // Defaults shared by every GATK job this library creates.
  trait StandardCommandLineGATK extends CommandLineGATK {
    this.reference_sequence = library.attributes.getProject.getReferenceFile
    this.intervals = library.attributes.getProject.getIntervalList
    this.DBSNP = library.attributes.getProject.getDbsnpFile
    // low global default; individual jobs raise this where they need more
    this.memoryLimit = Some(2)
    this.jarFile = library.gatkJar
  }

  /**
   * @Doc: Creates a standard realigner target creator CLF given a bam, an output file, and the contigs over which to run
   * @Returns: A CLF for the realigner target creator job
   */
  protected def StandardRealignerTargetCreator(bam: File, contigs: List[String], output: File) : RealignerTargetCreator = {
    var rtc = new RealignerTargetCreator with StandardCommandLineGATK
    // contigs are supplied as interval strings; the yaml interval list is cleared so the two cannot conflict
    rtc.intervals = null
    rtc.intervalsString = contigs
    rtc.input_file :+= bam
    rtc.out = output
    rtc.analysisName = "RealignerTargetCreator"
    return rtc
  }

  /**
   * @Doc: Creates a standard indel cleaner CLF given a bam, the results of the target creator, and an output .bam file
   * @Returns: A CLF for the indel cleaning job
   */
  protected def StandardIndelCleaner(bam: File, contigs: List[String], targets: File, outBam: File) : IndelRealigner = {
    var realigner = new IndelRealigner with StandardCommandLineGATK
    realigner.intervalsString = contigs
    realigner.intervals = null
    realigner.input_file :+= bam
    realigner.out = outBam
    realigner.targetIntervals = targets
    realigner.analysisName = "IndelClean"
    // uncompressed intermediate output; the downstream merge step re-compresses
    realigner.bam_compression = Some(0)
    return realigner
  }

  /**
   * @Doc: Creates a standard split-by-contig indel cleaner job for a given bam file, RTC output, and bam to merge everything to
   * @Returns: A list of CLFs (todo -- wrapped in a Pipeline)
   */
  protected def StandardIndelCleanBam(bam: File, jobContigs: List[List[String]], targets: File, cleanedBam: File) : List[CommandLineFunction] = {
    var cmds : List[CommandLineFunction] = Nil
    // one (input bam, per-contig-group output bam, contig group) spec per cleaning job
    var jobSpecs : List[(File,File,List[String])] = jobContigs.map[(File,File,List[String]),List[(File,File,List[String])]](
      ctigs => { (bam, swapExt(bam,".bam",".%s.bam".format(ctigs.mkString("_"))), ctigs) }
    )
    var bamsToMerge : List[File] = Nil
    for ( spec <- jobSpecs ) {
      cmds :+= StandardIndelCleaner(spec._1,spec._3,targets,spec._2)
      bamsToMerge :+= spec._2
    }
    // merge the per-contig cleaned bams back into the single cleaned output
    cmds :+= StandardPicardFixMates(bamsToMerge,cleanedBam,library.fixMatesJar)
    return cmds
  }

  /**
   * @Doc: Given a list of (pairs of) bams and cleaned bams to write to, and a number of jobs, creates a set of
   * command line functions to do the target-creating, splitting, cleaning, and merging, returning that list
   * of command line functions
   * @Returns: A list of command line functions for the full indel realignment pipeline from the collection
   * of uncleaned bams to the collection of cleaned bams
   */
  protected def StandardIndelRealign( bamsUncleanCleanPairs: List[(File,File)], nJobs: Int = 1 ) : List[CommandLineFunction] = {
    // partition the interval list's contigs into nJobs roughly-equal groups
    val contigsForJobs : List[List[String]] = PipelineUtils.smartSplitContigs(library.attributes.getProject.getReferenceFile, library.attributes.getProject.getIntervalList, nJobs)
    var commands : List[CommandLineFunction] = Nil
    for ( bamPair <- bamsUncleanCleanPairs ) {
      // the target creator runs once over all contigs (the groups flattened back together)
      val rtc : RealignerTargetCreator = StandardRealignerTargetCreator(bamPair._1,contigsForJobs.foldLeft[List[String]](Nil)( (a,b) => a ::: b), swapExt(bamPair._1,".bam",".targets") )
      val icbs : List[CommandLineFunction] = StandardIndelCleanBam(bamPair._1,contigsForJobs,rtc.out,bamPair._2)
      // index the final cleaned bam
      val sam : SamtoolsIndexFunction = new SamtoolsIndexFunction
      sam.bamFile = bamPair._2
      sam.analysisName = "SamtoolsIndex"
      commands :+= rtc
      commands ++= icbs
      commands :+= sam
    }
    return commands
  }

  /**
   * @Doc: Merges N bam files into one bam file, fixing mate pairs in the process; does not assume they are sorted
   * @Returns: Command line function for the merge, fix-mate, and sort operation
   */
  protected def StandardPicardFixMates(inBams: List[File], outBam: File, picardJar: File) : CommandLineFunction = {
    var pfm : PicardFixMates = new PicardFixMates
    pfm.bams = inBams
    pfm.outBam = outBam
    pfm.jarFile = picardJar
    pfm.assumeSorted = Some(false)
    pfm.memoryLimit = Some(4)
    pfm.analysisName = "FixMates"
    return pfm
  }

  // Thin Picard wrapper exposing the fix-mates jar's input/output contract to Queue.
  class PicardFixMates extends PicardBamJarFunction {
    @Input(doc="input bam files") var bams: List[File] = Nil
    @Output(doc="output bam file") var outBam: File = null
    def inputBams: List[File] = bams
    def outputBam: File = outBam
  }

  // Replaces oldExtension with newExtension; strips any directory component
  // (File.getName), so the result is relative to the working directory.
  def swapExt(file: File, oldExtension: String, newExtension: String) =
    new File(file.getName.stripSuffix(oldExtension) + newExtension)
}

View File

@ -0,0 +1,13 @@
package org.broadinstitute.sting.queue.pipeline
import org.broadinstitute.sting.queue.function.CommandLineFunction
/**
 * Base contract for a pipeline: an accumulated list of command line functions
 * plus lifecycle hooks for generating and pruning them.
 */
trait QPipeline {
  // The command line functions that make up this pipeline.
  var commands: List[CommandLineFunction] = Nil
  // Populates (or re-populates) the commands list.
  def generateCommands
  def track // todo -- implement me
  def removeIntermediate // todo -- implement me
}

View File

@ -0,0 +1,340 @@
package org.broadinstitute.sting.queue.pipeline
import java.io.File
import net.sf.picard.reference.FastaSequenceFile
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.DownsampleType
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.extensions.samtools._
import org.broadinstitute.sting.queue.{QException, QScript}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
/**
 * Library of standard variant-calling command line functions (UnifiedGenotyper,
 * indel calling/merging, filtering, variant quality recalibration, annotation),
 * configured from a pipeline yaml file.
 */
class VariantCalling(yaml: File,gatkJar: File) {
  vc =>

  // Project-wide attributes (reference, interval list, dbSNP, project) loaded from the yaml.
  var attributes: Pipeline = YamlUtils.load(classOf[Pipeline], yaml)

  /**
   * Trait to propagate basic attributes throughout the library
   */
  trait StandardCommandLineGATK extends CommandLineGATK {
    this.reference_sequence = vc.attributes.getProject.getReferenceFile
    this.intervals = vc.attributes.getProject.getIntervalList
    this.DBSNP = vc.attributes.getProject.getDbsnpFile
    // set global memory limit on the low side. Additional input bams will affect it.
    this.memoryLimit = Some(2)
    this.jarFile = vc.gatkJar
  }

  /**
   * @Doc: Creates a standard UnifiedGenotyper CLF from input bams and an output file
   * @Return: UnifiedGenotyper with the standard GSA arguments
   * @TODO: Add a formula: f(#bams)=memory; allow yaml to specify triggers and perhaps other information
   */
  protected def StandardUnifiedGenotyper(bams : List[File], output : File) : UnifiedGenotyper = {
    var ug = new UnifiedGenotyper with StandardCommandLineGATK
    ug.analysisName = "UnifiedGenotyper"
    ug.input_file = bams
    ug.group :+= "Standard"
    ug.out = output
    ug.min_base_quality_score = Some(10)
    ug.min_mapping_quality_score = Some(10)
    ug.cap_base_quality_by_mapping_quality = true
    ug.standard_min_confidence_threshold_for_emitting = Some(10)
    ug.standard_min_confidence_threshold_for_calling = Some(30)
    ug.trigger_min_confidence_threshold_for_calling = Some(0)
    ug.downsample_to_coverage = Some(300)
    ug.dt = Some(DownsampleType.BY_SAMPLE)
    ug.scatterCount = 50
    // many-sample runs need headroom beyond the 2g library default
    if ( bams.size > 125 ) {
      ug.memoryLimit = Some(4)
    }
    return ug
  }

  /**
   * @Doc: Creates a CLF to call indels on a specific .bam file, outputting to a given output file
   * @Returns: An IndelGenotyperV2 CLF with standard GSA arguments
   */
  protected def StandardIndelGenotyper(bam : File, output: File) : IndelGenotyperV2 = {
    var ig = new IndelGenotyperV2 with StandardCommandLineGATK
    ig.analysisName = "IndelGenotyper"
    ig.input_file :+= bam
    ig.out = output
    ig.downsample_to_coverage = Some(300)
    return ig
  }

  /**
   * @Doc: Accessor method to StandardIndelGenotyper that allows it to be marked as a scatter job in a pipeline
   * @Returns: An IndelGenotyperV2 CLF with standard GSA arguments, marked as a scatter job
   */
  private def StandardIndelGenotyper(bam : File, output: File, gather: Boolean) : IndelGenotyperV2 = {
    var ig = StandardIndelGenotyper(bam,output)
    ig.isGather = gather
    return ig
  }

  /**
   * @Doc: Combines a list of indel VCFs to a single output file
   * @Returns: A CombineVariants CLF with standard GSA arguments
   */
  protected def StandardIndelCombine( igList : List[IndelGenotyperV2], output : File ) : CombineVariants = {
    var cv = new CombineVariants with StandardCommandLineGATK
    cv.out = output
    cv.genotypemergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.UNIQUIFY)
    cv.variantmergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION)
    // NOTE(review): shares the "IndelGenotyper" analysis name with the per-bam
    // genotypers above -- confirm this grouping is intentional
    cv.analysisName = "IndelGenotyper"
    cv.isGather = true
    // NOTE(review): priority uses getAbsolutePath while the rod bindings are
    // registered under getName -- confirm CombineVariants matches them up
    cv.priority = (igList.map[String,List[String]](ig => swapExt(ig.out,".vcf","").getAbsolutePath)).mkString(",")
    cv.rodBind = igList.map[RodBind,List[RodBind]](ig => new RodBind(swapExt(ig.out,".vcf","").getName,"VCF",ig.out))
    return cv
  }

  /**
   * @Doc: Generates indel calls on a list of .bam files, and merges those calls into an output file. This is a small pipeline.
   * @Returns: A list of CLFs that run indel calls and indel merging. User has zero control over individual indel VCF names.
   */
  protected def StandardIndelCalls ( bams : List[File], output : File ) : List[CommandLineGATK] = {
    var genotypers = bams.foldLeft[List[IndelGenotyperV2]](Nil)( (igs,bam) => igs ::: List(this.StandardIndelGenotyper(bam, swapExt(bam,".bam",".indels.vcf"), false)))
    var combine = this.StandardIndelCombine( genotypers, output )
    var callFunctions: List[CommandLineGATK] = genotypers
    callFunctions = combine :: callFunctions
    return callFunctions
  }

  /**
   * @Doc: Builds a VariantFiltration job that masks snp calls near indels and
   * flags snp clusters
   * @Returns: A VariantFiltration CLF with the standard indel mask and cluster settings
   */
  protected def StandardFilterAtIndels ( snps: File, indels: File, output : File ) : VariantFiltration = {
    var iFil = new VariantFiltration with StandardCommandLineGATK
    iFil.analysisName = "FilterAtIndels"
    iFil.out = output
    iFil.mask = indels.getAbsolutePath
    iFil.maskName = "Indel_Mask"
    iFil.variantVCF = snps
    // todo -- cluster size varies with # bams
    iFil.clusterSize = Some(5)
    iFil.clusterWindowSize = Some(8)
    return iFil
  }

  /**
   * @Doc: Builds the standard GSA hand-filter (quality-by-depth, allele balance,
   * strand bias expressions) over a snp VCF
   * @Returns: A VariantFiltration CLF applying the three hand-filter expressions
   */
  protected def StandardHandfilter( snps: File, output: File ) : VariantFiltration = {
    var hFil = new VariantFiltration with StandardCommandLineGATK
    hFil.analysisName = "HandFilter"
    hFil.out = output
    hFil.variantVCF = snps
    hFil.filterExpression :+= "QD<5"
    hFil.filterName :+= "LowQualByDepth"
    hFil.filterExpression :+= "AB>0.75"
    hFil.filterName :+= "HighAlleleBalance"
    hFil.filterExpression :+= "SB>-0.10"
    hFil.filterName :+= "HighStrandBias"
    return hFil
  }

  /**
   * @Doc: Builds the variant-cluster generation step of quality recalibration
   * using the standard annotation set
   * @Returns: A GenerateVariantClusters CLF writing its clusters to output
   */
  protected def StandardVariantCluster( snps: File, output: File ) : GenerateVariantClusters = {
    var genC = new GenerateVariantClusters with StandardCommandLineGATK
    genC.analysisName = "VariantQualityRecalibration"
    genC.rodBind :+= new RodBind("input","VCF",snps)
    genC.cluster_file = output
    genC.use_annotation :+= "QD"
    genC.use_annotation :+= "SB"
    genC.use_annotation :+= "HaplotypeScore"
    genC.use_annotation :+= "AB"
    genC.use_annotation :+= "HRun"
    return genC
  }

  /**
   * @Doc: Builds the recalibration step of variant quality recalibration with
   * the standard tranche cut points
   * @Returns: A VariantRecalibrator CLF writing the recalibrated VCF, tranches
   * file, and report data file
   */
  protected def StandardVariantRecalibrator ( raw_vcf: File, cluster: File, target_titv: scala.Double, out_vcf: File,
                                              out_tranches: File, out_dat: File) : VariantRecalibrator = {
    var vr = new VariantRecalibrator with StandardCommandLineGATK
    vr.analysisName = "VariantQualityRecalibration"
    vr.rodBind :+= new RodBind("input","VCF",raw_vcf)
    vr.cluster_file = cluster
    vr.target_titv = target_titv
    vr.out = out_vcf
    vr.tranches_file = out_tranches
    vr.report_dat_file = out_dat
    vr.tranche :+= "0.1"
    vr.tranche :+= "1"
    vr.tranche :+= "5"
    vr.tranche :+= "10"
    return vr
  }

  /**
   * @Doc: Builds the tranche-cutting step of variant quality recalibration at a
   * 5% FDR filter level
   * @Returns: An ApplyVariantCuts CLF writing the final filtered VCF to output
   */
  protected def StandardApplyVariantCuts( snpRecal: File, tranches: File, output: File) : ApplyVariantCuts = {
    var avc = new ApplyVariantCuts with StandardCommandLineGATK
    avc.analysisName = "VariantQualityRecalibration"
    avc.rodBind :+= new RodBind("input","VCF",snpRecal)
    avc.out = output
    avc.tranches_file = tranches
    avc.fdr_filter_level = Some(5)
    return avc
  }

  /**
   * @Doc: Runs the full variant-quality-recalibration chain (cluster -> recalibrate -> cut)
   * over a raw snp VCF, leaving the final tranche-filtered calls in recalVcf
   * @Returns: The three CLFs, in dependency order
   */
  protected def StandardRecalibrateVariants( snps: File, targetTiTv: scala.Double, recalVcf: File) : List[CommandLineGATK] = {
    var clust = StandardVariantCluster(snps, swapExt(snps,".vcf",".cluster"))
    // NOTE(review): the cluster output is assigned via cluster_file above but read
    // back here as clusterFile -- confirm GenerateVariantClusters exposes both names
    var recal = StandardVariantRecalibrator(snps,clust.clusterFile,targetTiTv,swapExt(snps,".vcf",".recal.vcf"),
        swapExt(snps,".vcf",".recal.tranch"),swapExt(snps,".vcf",".recal.dat"))
    // FIX: the cut step previously wrote to recal.out -- its own input -- and the
    // recalVcf parameter was never used; the final cut VCF now lands at the
    // caller-requested path.
    var cut = StandardApplyVariantCuts(recal.out,recal.tranches_file,recalVcf)
    var cmds: List[CommandLineGATK] = Nil
    cmds :+= clust
    cmds :+= recal
    cmds :+= cut
    return cmds
  }

  /**
   * @Doc: Builds a GenomicAnnotator job that annotates a snp VCF against a refseq table
   * @Returns: A GenomicAnnotator CLF writing the annotated VCF to outputVCF
   */
  protected def StandardGenomicAnnotation ( snps: File, refseqFile: File, outputVCF: File) : GenomicAnnotator = {
    var ga = new GenomicAnnotator with StandardCommandLineGATK
    ga.analysisName = "GenomicAnnotator"
    ga.variantVCF = snps
    ga.rodBind :+= new RodBind("refseq","AnnotatorInputTable",refseqFile)
    ga.rodToIntervalTrackName = "variant"
    ga.out = outputVCF
    return ga
  }

  /**
   * @Doc: Full snp-calling pipeline: raw snps, indel calls, indel-mask filtering,
   * optional refGene annotation, and quality recalibration into output
   * @Returns: The CLFs for the whole chain, in dependency order
   */
  protected def StandardSNPCalls( bams: List[File], output: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = {
    var commands : List[CommandLineGATK] = Nil
    // intermediate files are written next to the requested output
    var dir = ""
    if ( output.getParent != null ) {
      dir = output.getParent+"/"
    }
    // NOTE(review): getProject is concatenated directly into the file name
    // (implicit toString) -- presumably a project *name* accessor was intended; confirm
    var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf")
    var ug = StandardUnifiedGenotyper(bams, raw_snp)
    commands :+= ug
    var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf")
    var ig = StandardIndelCalls(bams,raw_indel)
    commands ++= ig
    var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
    var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp)
    commands :+= iFilt
    var annoSNP = prefilt_snp
    if ( refGene != null ) {
      annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
      var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
      commands :+= annotate
    }
    var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, output)
    commands ++= recal
    return commands
  }

  /**
   * @Doc: Same chain as StandardSNPCalls, but additionally produces a
   * hand-filtered VCF alongside the recalibrated one
   * @Returns: The CLFs for the whole chain, in dependency order
   */
  protected def StandardSNPCallsBothFilterTypes(bams: List[File], recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = {
    var commands : List[CommandLineGATK] = Nil
    var dir = ""
    if ( recalOut.getParent != null ) {
      dir = recalOut.getParent+"/"
    }
    // NOTE(review): same implicit getProject toString as StandardSNPCalls -- confirm
    var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf")
    var ug = StandardUnifiedGenotyper(bams, raw_snp)
    commands :+= ug
    var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf")
    var ig = StandardIndelCalls(bams,raw_indel)
    commands ++= ig
    var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
    var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp)
    commands :+= iFilt
    var annoSNP = prefilt_snp
    if ( refGene != null ) {
      annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
      var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
      commands :+= annotate
    }
    var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut)
    commands ++= recal
    // hand filter runs off the pre-annotation snps, in parallel with recalibration
    var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut)
    commands :+= handFilt
    return commands
  }

  /**
   * @Doc: Same chain as StandardSNPCallsBothFilterTypes, but the merged indel
   * calls are written to a caller-chosen indelOut instead of a derived name
   * @Returns: The CLFs for the whole chain, in dependency order
   */
  protected def StandardCallingPipeline(bams: List[File], indelOut: File, recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = {
    var commands : List[CommandLineGATK] = Nil
    var dir = ""
    if ( recalOut.getParent != null ) {
      dir = recalOut.getParent+"/"
    }
    // NOTE(review): same implicit getProject toString as StandardSNPCalls -- confirm
    var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf")
    var ug = StandardUnifiedGenotyper(bams, raw_snp)
    commands :+= ug
    var raw_indel = indelOut
    var ig = StandardIndelCalls(bams,raw_indel)
    commands ++= ig
    var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
    var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp)
    commands :+= iFilt
    var annoSNP = prefilt_snp
    if ( refGene != null ) {
      annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
      var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
      commands :+= annotate
    }
    var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut)
    commands ++= recal
    var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut)
    commands :+= handFilt
    return commands
  }

  // Replaces oldExtension with newExtension; strips any directory component
  // (File.getName), so the result is relative to the working directory.
  def swapExt(file: File, oldExtension: String, newExtension: String) =
    new File(file.getName.stripSuffix(oldExtension) + newExtension)
}

View File

@ -0,0 +1,55 @@
package org.broadinstitute.sting.queue.util
import net.sf.picard.reference.ReferenceSequenceFileFactory
import java.io.File
import org.broadinstitute.sting.utils.GenomeLocParser
import collection.JavaConversions._
import org.broadinstitute.sting.utils.interval.IntervalUtils
/** Placeholder type; all functionality lives on the companion object. */
class PipelineUtils {
}

/**
 * Static helpers shared by the pipeline libraries.
 */
object PipelineUtils{
  /**
   * Splits the contigs named in an interval list into (at most) `sets` groups
   * of roughly equal total target size, greedily assigning each contig to the
   * currently-lightest group.
   */
  def smartSplitContigs(reference: File, intervals: File, sets: Int) : List[List[String]] = {
    GenomeLocParser.setupRefContigOrdering(ReferenceSequenceFileFactory.getReferenceSequenceFile(reference))
    val targets = IntervalUtils.parseIntervalArguments(List(intervals.getAbsolutePath), false)

    // Total up the interval sizes per contig.
    var sizeByContig = Map.empty[String, Long]
    targets.foreach( loc =>
      sizeByContig += loc.getContig -> (sizeByContig.getOrElse(loc.getContig, 0L) + loc.size)
    )

    // Greedy bin packing: seed one bin per contig until `sets` bins exist, then
    // always drop the next contig into the lightest bin.
    var bins = List.empty[(Long, List[String])]
    for ((contigName, contigSize) <- sizeByContig) {
      if (bins.size < sets) {
        bins :+= contigSize -> List(contigName)
      } else {
        val ordered = bins.sortBy{case (size, _) => size}
        val lightest = ordered.head
        bins = ((lightest._1 + contigSize) -> (lightest._2 :+ contigName)) :: ordered.tail
      }
    }
    bins.map{case (_, contigs) => contigs}
  }
}