Added - a tool to fix reference bases of a VCF. The OMNI had a couple of sites with incorrect reference bases (look to be legacy from other chips), and a few more that had ref and alt flipped. GAP should probably take care of it, but since I need results by monday, I'm doing it.

Modified - SelectVariants: Hook up to VariantContextUtils to recalculate AC/AF/AN, which uses the accessor in VariantContext to do this. Somehow sites that were selected down to hom-ref genotypes only wound up getting positive AC. 

**IMPORTANT** I kind of need input here. The header of a file used for an integration test specifies AC as being an integer. Recalculating it casts it into an integer list (which it should be, as it allows for alternate alleles). However this appears to clash with what the jexl expression is looking for? For now, the integration test itself needed to be changed -- it's unclear what to do when the header specifies AC of being one class, but recalculating it casts to another class, and I'm not sure what to do.

I'm committing my omni_qc pipeline because I'm almost certain 2 months down the road I'm going to wonder what the heck I did to generate my results.




git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4511 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-10-17 03:18:01 +00:00
parent 7aa030a9a4
commit 2bc5971ca1
5 changed files with 490 additions and 83 deletions

View File

@ -212,6 +212,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
VariantContext sub = subsetRecord(vc, samples);
if ( (sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) {
//System.out.printf("%s%n",sub.toString());
for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) {
if ( !VariantContextUtils.match(sub, jexl) ) {
return 0;
@ -231,7 +232,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
* @param vc the VariantContext record to subset
* @param samples the samples to extract
* @return the subsetted VariantContext
*/
*/
private VariantContext subsetRecord(VariantContext vc, Set<String> samples) {
if ( samples == null || samples.isEmpty() )
return vc;
@ -246,17 +247,11 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
HashMap<String, Object> attributes = new HashMap<String, Object>(sub.getAttributes());
int alleleCount = 0;
int numberOfAlleles = 0;
int depth = 0;
for (String sample : sub.getSampleNames()) {
Genotype g = sub.getGenotype(sample);
if (g.isNotFiltered() && g.isCalled()) {
numberOfAlleles += g.getPloidy();
if (g.isHet()) { alleleCount++; }
else if (g.isHomVar()) { alleleCount += 2; }
String dp = (String) g.getAttribute("DP");
if (dp != null && ! dp.equals(VCFConstants.MISSING_DEPTH_v3) && ! dp.equals(VCFConstants.MISSING_VALUE_v4) ) {
@ -265,13 +260,8 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
}
}
attributes.put("AC", alleleCount);
attributes.put("AN", numberOfAlleles);
if (numberOfAlleles == 0) {
attributes.put("AF", 0.0);
} else {
attributes.put("AF", ((double) alleleCount) / ((double) numberOfAlleles));
}
VariantContextUtils.calculateChromosomeCounts(sub,attributes,false);
attributes.put("DP", depth);
sub = VariantContext.modifyAttributes(sub, attributes);

View File

@ -6,11 +6,14 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.report.tags.Analysis;
import org.broadinstitute.sting.utils.report.tags.DataPoint;
import org.broadinstitute.sting.utils.report.utils.TableType;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.util.List;
import java.util.Map;
/**
*
@ -40,6 +43,12 @@ public class AlleleFrequencyComparison extends VariantEvaluator {
if ( ! (isValidVC(eval) && isValidVC(comp)) ) {
return null;
} else {
if ( missingField(eval) ) {
recalculateCounts(eval);
}
if ( missingField(comp) ) {
recalculateCounts(comp);
}
afTable.update(getAF(eval),getAF(comp));
acTable.update(getAC(eval),getAC(comp));
}
@ -47,16 +56,64 @@ public class AlleleFrequencyComparison extends VariantEvaluator {
return null; // there is nothing interesting
}
private static boolean missingField(final VariantContext vc) {
return ! ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY));
}
private static void recalculateCounts(VariantContext vc) {
Map<String,Object> attributes = vc.getAttributes();
VariantContextUtils.calculateChromosomeCounts(vc,attributes,false);
VariantContext.modifyAttributes(vc,attributes);
}
private static boolean isValidVC(final VariantContext vc) {
return (vc != null && !vc.isFiltered() && vc.getAlternateAlleles().size() == 1 && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY));
return (vc != null && !vc.isFiltered() && vc.getAlternateAlleles().size() == 1);
}
private static double getAF(VariantContext vc) {
return ((List<Double>) vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)).get(0);
Object af = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY);
if ( List.class.isAssignableFrom(af.getClass())) {
return ( (List<Double>) af ).get(0);
} else if ( String.class.isAssignableFrom(af.getClass())) {
// two possibilities
String s = (String) af;
try {
if ( s.startsWith("[") ) {
return Double.parseDouble(s.replace("\\[","").replace("\\]",""));
} else {
return Double.parseDouble(s);
}
} catch (NumberFormatException e) {
throw new UserException("Allele frequency field may be improperly formatted, found AF="+s);
}
} else if ( Double.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY).getClass())) {
return (Double) af;
} else {
throw new UserException(String.format("Class of Allele Frequency does not appear to be formated, had AF=%s, of class %s",af.toString(),af.getClass()));
}
}
private static int getAC(VariantContext vc) {
return ((List<Integer>) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY)).get(0);
Object ac = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY);
if ( List.class.isAssignableFrom(ac.getClass())) {
return ( (List<Integer>) ac ).get(0);
} else if ( String.class.isAssignableFrom(ac.getClass())) {
// two possibilities
String s = (String) ac;
try {
if ( s.startsWith("[") ) {
return Integer.parseInt(s.replace("\\[","").replace("\\]",""));
} else {
return Integer.parseInt(s);
}
} catch (NumberFormatException e) {
throw new UserException("Allele count field may be improperly formatted, found AC="+s);
}
} else if ( Double.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY).getClass())) {
return (Integer) ac;
} else {
throw new UserException(String.format("Class of Allele Frequency does not appear to be formated, had AF=%s, of class %s",ac.toString(),ac.getClass()));
}
}
}

View File

@ -0,0 +1,174 @@
package org.broadinstitute.sting.oneoffprojects.walkers.vcftools;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFConstants;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.vcf.VCFWriter;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Oct 16, 2010
* Time: 8:20:41 PM
* To change this template use File | Settings | File Templates.
*/
@Requires(value={},referenceMetaData=@RMD(name="variant", type= VariantContext.class))
public class FixRefBases extends RodWalker<Integer,Integer> {
@Output(doc="output file to write to",required=true)
VCFWriter out;
public void initialize() {
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList("variant"));
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
out.writeHeader(new VCFHeader(headerLines));
}
public Integer reduceInit() {
return 0;
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker != null && tracker.hasROD("variant") ) {
VariantContext vc = tracker.getVariantContext(ref,"variant",null,context.getLocation(),true);
VariantContext newContext = null;
if ( vc.isSNP() && ref.getBase() != vc.getReference().getBases()[0] && vc.getReference().length() == 1 ) {
if ( basesAreFlipped(vc,ref) ) {
logger.warn(String.format("Variant context at %s has ref and alt bases flipped according to reference",ref.getLocus().toString()));
newContext = flipBases(vc,ref);
} else {
HashSet<Allele> newAlleles = new HashSet<Allele>(vc.getAlternateAlleles());
Allele refAllele = Allele.create(ref.getBase(),true);
newAlleles.add(refAllele);
newContext = new VariantContext("FixRefBasesVC", ref.getLocus().getContig(),
ref.getLocus().getStart(), ref.getLocus().getStop(), newAlleles, fixGenotypes(vc.getGenotypes(),refAllele),
vc.hasNegLog10PError() ? 10.0*vc.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE,
vc.isFiltered() ? null : vc.getFilters(), vc.getAttributes());
}
out.add(newContext,ref.getBase());
} else {
out.add(vc,ref.getBase());
}
}
return 0;
}
public Integer reduce(Integer map, Integer reduce) {
return map + reduce;
}
public void onTraversalDone(Integer fReduce) {
logger.info(String.format("Fixed %d records",fReduce));
}
private boolean basesAreFlipped(VariantContext vc, ReferenceContext reference) {
for ( Allele a : vc.getAlternateAlleles() ) {
if ( a.getBases().length == 1 && a.getBases()[0] == reference.getBase() ) {
return true;
}
}
return false;
}
private VariantContext flipBases(VariantContext vc, ReferenceContext ref) {
HashSet<Allele> newAlleles = new HashSet<Allele>(vc.getAlleles().size());
newAlleles.add(Allele.create(ref.getBase(),true));
newAlleles.add(Allele.create(vc.getReference().getBases()[0],false));
for ( Allele a : vc.getAlternateAlleles() ) {
if ( a.getBases()[0] != ref.getBase() ) {
newAlleles.add(a);
}
}
VariantContext newVC = new VariantContext("FixRefBasesVC", ref.getLocus().getContig(),
ref.getLocus().getStart(), ref.getLocus().getStop(), newAlleles, flipGenotypes(vc.getGenotypes(),newAlleles),
vc.hasNegLog10PError() ? 10.0*vc.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE,
vc.isFiltered() ? null : vc.getFilters(), vc.getAttributes());
Map<String,Object> attribs = new HashMap<String,Object>(newVC.getAttributes());
VariantContextUtils.calculateChromosomeCounts(newVC,attribs,false);
VariantContext.modifyAttributes(newVC,attribs);
return newVC;
}
private Map<String, Genotype> fixGenotypes(Map<String,Genotype> old, Allele newRef) {
HashMap<String,Genotype> newGTs = new HashMap<String,Genotype>(old.size());
for ( Map.Entry<String,Genotype> e : old.entrySet() ) {
newGTs.put(e.getKey(),fixGenotype(e.getValue(),newRef));
}
return newGTs;
}
private Genotype fixGenotype(Genotype g, Allele newRef) {
List<Allele> newAlleles = new ArrayList<Allele>(g.getAlleles().size());
for ( Allele a : g.getAlleles() ) {
if ( a.isReference() ) {
newAlleles.add(newRef);
} else {
newAlleles.add(a);
}
}
return new Genotype(g.getSampleName(),
newAlleles,g.hasNegLog10PError() ? g.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE,
g.getAttributes().keySet(), g.getAttributes(),g.genotypesArePhased());
}
private Map<String,Genotype> flipGenotypes(Map<String,Genotype> old, Set<Allele> newAlleles) {
HashMap<String,Genotype> newGTs = new HashMap<String,Genotype>(old.size());
for ( Map.Entry<String,Genotype> e : old.entrySet() ) {
newGTs.put(e.getKey(),flipGenotype(e.getValue(),newAlleles));
}
return newGTs;
}
private Genotype flipGenotype(Genotype old, Set<Allele> newAlleles) {
Allele ref = null;
for ( Allele a : newAlleles ) {
if ( a.isReference() ) {
ref = a;
}
}
if ( ref == null ) {
throw new StingException("No reference allele in variant context with which to flip genotype alleles");
}
List<Allele> newGTAlleles = new ArrayList<Allele>(old.getAlleles().size());
for ( Allele a : old.getAlleles() ) {
if ( ! a.isNoCall() && a.getBases()[0] == ref.getBases()[0] ) {
newGTAlleles.add(ref);
} else {
if ( a.isReference() ) {
newGTAlleles.add(Allele.create(a.getBases(),false));
} else {
newGTAlleles.add(a);
}
}
}
return new Genotype(old.getSampleName(),
newGTAlleles,old.hasNegLog10PError() ? old.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE,
old.getAttributes().keySet(), old.getAttributes(),old.genotypesArePhased());
}
}

View File

@ -16,9 +16,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -sn A -sn '[CDH]' -sn " + samplesFile + " -env -ef -select 'AF < 0.2' -B:variant,VCF " + testfile + " -NO_HEADER"),
baseTestString(" -sn A -sn '[CDH]' -sn " + samplesFile + " -env -ef -select 'DP < 250' -B:variant,VCF " + testfile + " -NO_HEADER"),
1,
Arrays.asList("3a15628b5980031c629c0c33e7e60b40")
Arrays.asList("e22db73fd99fd4f28d472d407e40ead5")
);
executeTest("testComplexSelection--" + testfile, spec);

View File

@ -1,6 +1,7 @@
import java.io.{FileReader, File, BufferedReader}
import net.sf.picard.reference.FastaSequenceFile
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils
import org.broadinstitute.sting.gatk.DownsampleType
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
import org.broadinstitute.sting.queue.extensions.gatk._
@ -10,6 +11,7 @@ import org.broadinstitute.sting.queue.{QException, QScript}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
import scala.collection.mutable.HashMap
class omni_qc extends QScript {
qscript =>
@ -19,95 +21,279 @@ class omni_qc extends QScript {
var pilot1_ceu_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CEU.low_coverage.2010_07.genotypes.vcf.gz","vcf")
var pilot1_chb_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CHBJPT.low_coverage.2010_07.genotypes.vcf.gz","vcf")
var pilot1_yri_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/YRI.low_coverage.2010_07.genotypes.vcf.gz","vcf")
var august_calls_vcf = null
var august_calls_EUR = new TaggedFile("/humgen/1kg/processing/release/august/EUR.vcf","vcf")
var august_calls_ASN = new TaggedFile("/humgen/1kg/processing/release/august/ASN.vcf","vcf")
var august_calls_AFR = new TaggedFile("/humgen/1kg/processing/release/august/AFR.vcf","vcf")
var august_calls_EUR_refined = new TaggedFile("/humgen/1kg/processing/release/august/EUR.beagle.vcf.gz","vcf")
var august_calls_ASN_refined = new TaggedFile("/humgen/1kg/processing/release/august/ASN.beagle.vcf.gz","vcf")
var august_calls_AFR_refined = new TaggedFile("/humgen/1kg/processing/release/august/AFR.beagle.vcf.gz","vcf")
var hiseq_calls_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878.HiSeq.v9.b36.vcf.gz","vcf")
var pilot1_with_na12878_vcf = new TaggedFile("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/calls/v2/N60/lowpass.N60.recal.mG6.retranche.vcf","vcf")
//var august_calls_other_genotypes = _
// OMNI VCF FILES
var OMNI_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b36.vcf","vcf")
var OMNI_b37_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b37.deduped.vcf.gz","vcf")
var OMNI_hapmap_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_pilot_b36.vcf.gz","vcf")
var OMNI_b36_vcf = new TaggedFile("/humgen/illumina/1kg_seq_vcfs/Illumina_HapMap_Omni_2.5_764samples.vcf","vcf")
var OMNI_b37_vcf = new TaggedFile("/broad/shptmp/chartl/Omni_2.5_764_samples.b37.deduped.vcf","vcf")
var OMNI_hapmap_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_pilot.b36.vcf","vcf")
// INTERVALS
var pilot3_interval_list: String = "/humgen/gsa-hpprojects/1kg/1kg_pilot3/documents/CenterSpecificTargetLists/results/p3overlap.targets.b36.interval_list"
var pilot1_interval_list: String = _
var hiseq_interval_list: String = _
var production_interval_list: String = "20"
var pilot1_interval_list: String = "/broad/shptmp/chartl/omni/resources/Omni_b36_sites.interval.list"
var hiseq_interval_list: String = "/broad/shptmp/chartl/omni/resources/Omni_b36_sites.interval.list"
var production_interval_list: String = "/broad/shptmp/chartl/omni/resources/Omni_b37_sites.chr20.interval.list"
// REFERENCES
var b36_ref = new File("/humgen/1kg/reference/human_b36_both.fasta")
var b37_ref = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
// OTHER
var failed_qc_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/failed_qc_samples.txt")
val analysis_dir = "/broad/shptmp/chartl/omni/"
val resources_dir = analysis_dir + "resources/"
val scratch_dir = analysis_dir + "scratch/"
val eval_dir = analysis_dir + "eval/"
val vcf_dir = analysis_dir + "vcfs/"
trait OmniArgs extends CommandLineGATK {
this.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta")
this.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
}
protected def sampleFileToString(samFile: File) : List[String] = {
var reader = new BufferedReader(new FileReader(samFile))
var line: String = ""
var sampleList: List[String] = Nil
line = reader.readLine
while ( line != null ) {
sampleList :+= line
line = reader.readLine
}
return sampleList
}
class vcf2bed(b: String) extends CommandLineFunction {
class vcf2bed extends CommandLineFunction {
@Input(doc="A VCF file to be put into an interval list") var in_vcf: File = _
@Output(doc="An interval list file to be used with -L") var out_list: File = _
def commandLine = "python /humgen/gsa-scr1/chartl/projects/omni/scripts/vcf2bed.py %s %s".format(in_vcf.getAbsolutePath,out_list.getAbsolutePath)
}
protected def runme(samples: File, intervals: String, compVCF: TaggedFile, outDir: String, base: String) {
var subset = new SelectVariants with OmniArgs
subset.variantVCF = Omni_chip_vcf
subset.analysisName = "Subset_"+base
subset.sample = sampleFileToString(samples)
if ( intervals != null ) {
subset.intervalsString :+= intervals
}
subset.out = new TaggedFile(outDir+base+".subset.vcf","vcf")
var makeOmniSiteList = new vcf2bed("foo")
makeOmniSiteList.analysisName = "Omni_list_"+base
makeOmniSiteList.in_vcf = subset.out
makeOmniSiteList.out_list = new TaggedFile(outDir+base+".subset.omni.intervals.list","intervals.list")
class GetSampleOverlap extends CommandLineFunction {
@Input(doc="A list of VCF files for which to calculate the sample overlap") var in_vcfs: List[File] = Nil
@Output(doc="A file to which to write the overlapping sample names") var outFile: File = _
var eval = new VariantEval with OmniArgs
eval.rodBind :+= new RodBind("evalOmni","vcf",subset.out)
eval.rodBind :+= new RodBind("comp"+base,"vcf",compVCF)
eval.intervals = makeOmniSiteList.out_list
eval.evalModule :+= "GenotypeConcordance"
eval.evalModule :+= "SimpleMetricsBySample"
eval.reportLocation = new TaggedFile(outDir+base+".subset.omni.eval","eval")
eval.reportType = Some(VE2TemplateType.R)
eval.analysisName = base+"_Eval"
eval.select ++= List("AC==1","AC==2","AC==3","AC==4","AC==5","AC>5")
eval.selectName ++= List("AC1","AC2","AC3","AC4","AC5","AC6+")
eval.disI = true
eval.outputVCF = outDir+base+".eval.interesting.sites.vcf"
/*def commandLine = "grep #CHR %s | sed 's/.vcf:/\\t/g' | cut -f11- | tr '\\t' '\\n' | sort | uniq -c | awk '$1 == %d' | awk '{print $2}' > %s".format(
in_vcfs.foldLeft[String]("")( (str,f) => if ( str.equals("") ) str + f.getAbsolutePath else str + " " + f.getAbsolutePath),
in_vcfs.size,
outFile.getAbsolutePath
)*/
def commandLine = "python /humgen/gsa-scr1/chartl/projects/omni/scripts/getOverlapSamples.py %s %s".format(
in_vcfs.foldLeft[String]("")( (str,f) => if ( str.equals("") ) str + f.getAbsolutePath else str + " " + f.getAbsolutePath),
outFile.getAbsolutePath
)
}
if ( base.equalsIgnoreCase("production") ) {
subset.variantVCF = Omni_b37_vcf
subset.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_g1k_v37.fasta")
eval.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_g1k_v37.fasta")
eval.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod")
}
class GunzipFile extends CommandLineFunction {
@Input(doc="file to gunzip") var gunzipMe: File = _
@Output(doc="file to gunzip to") var outFile: File = _
add(subset,makeOmniSiteList,eval)
def commandLine = "gunzip -c %s > %s".format(gunzipMe.getAbsolutePath,outFile.getAbsolutePath)
}
def script = {
runme(pilot1_overlap_samples,pilot1_interval_list,pilot1_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot1/","pilot1")
runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf_broad,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3")
runme(hiseq_overlap_samples,hiseq_interval_list,hiSeq_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/hiSeq/","NA12878_HiSeq")
runme(production_overlap_samples,production_interval_list,production_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/production/","production")
runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3_release")
/** Convert other chips to merged VCFs **/
//var august_call_other_chips: List[(String,File)] = processAuxiliaryChipData(august_calls_other_genotypes)
/** Unzip the pilot 1 VCFs and dump them into the resources directory **/
var gunzip_p1_ceu = new GunzipFile
var gunzip_p1_chb = new GunzipFile
var gunzip_p1_yri = new GunzipFile
var gunzip_hiseq = new GunzipFile
var gunzip_ag_eur = new GunzipFile
var gunzip_ag_asn = new GunzipFile
var gunzip_ag_afr = new GunzipFile
gunzip_p1_ceu.gunzipMe = pilot1_ceu_vcf
gunzip_p1_ceu.outFile = new File(resources_dir+"CEU.low_coverage.genotypes.vcf")
gunzip_p1_chb.gunzipMe = pilot1_chb_vcf
gunzip_p1_chb.outFile = new File(resources_dir+"CHB.low_coverage.genotypes.vcf")
gunzip_p1_yri.gunzipMe = pilot1_yri_vcf
gunzip_p1_yri.outFile = new File(resources_dir+"YRI.low_coverage.genotypes.vcf")
gunzip_hiseq.gunzipMe = hiseq_calls_vcf
gunzip_hiseq.outFile = new File(resources_dir+"HiSeq.b36.vcf")
gunzip_ag_eur.gunzipMe = august_calls_EUR_refined
gunzip_ag_eur.outFile = new File(resources_dir+"EUR.refined.vcf")
gunzip_ag_asn.gunzipMe = august_calls_ASN_refined
gunzip_ag_asn.outFile = new File(resources_dir+"ASN.refined.vcf")
gunzip_ag_afr.gunzipMe = august_calls_AFR_refined
gunzip_ag_afr.outFile = new File(resources_dir+"AFR.refined.vcf")
add(gunzip_p1_ceu,gunzip_p1_yri,gunzip_p1_chb,gunzip_hiseq,gunzip_ag_eur,gunzip_ag_asn,gunzip_ag_afr)
/** fix the omni ref bases **/
var fix_421 = new FixRefBases with OmniArgs
var fix_764 = new FixRefBases with OmniArgs
var fix_764_b37 = new FixRefBases with OmniArgs
fix_421.variantVCF = OMNI_hapmap_b36_vcf
fix_421.reference_sequence = b36_ref
fix_421.out = new File(vcf_dir+swapExt(OMNI_hapmap_b36_vcf.getName,".vcf",".ref_fixed.vcf"))
fix_764.variantVCF = OMNI_b36_vcf
fix_764.reference_sequence = b36_ref
fix_764.out = new File(vcf_dir+swapExt(OMNI_b36_vcf.getName,".vcf",".ref_fixed.vcf"))
fix_764_b37.variantVCF = OMNI_b37_vcf
fix_764_b37.reference_sequence = b37_ref
fix_764_b37.out = new File(vcf_dir+swapExt(OMNI_b37_vcf.getName,".vcf",".ref_fixed.vcf"))
add(fix_421,fix_764,fix_764_b37)
/** Propagate AC/AN annotations to Omni files via variant annotator **/
var annotate_421 = new VariantAnnotator with OmniArgs
var annotate_764 = new VariantAnnotator with OmniArgs
var annotate_764_b37 = new VariantAnnotator with OmniArgs
annotate_421.variantVCF = fix_421.out
annotate_421.reference_sequence = b36_ref
annotate_421.annotation :+= "ChromosomeCounts"
annotate_421.out = new File(vcf_dir+swapExt(fix_421.out.getName,".vcf",".annot.vcf"))
annotate_764.variantVCF = fix_764.out
annotate_764.reference_sequence = b36_ref
annotate_764.annotation :+= "ChromosomeCounts"
annotate_764.out = new File(vcf_dir+swapExt(fix_764.out.getName,".vcf",".annot.vcf"))
annotate_764_b37.variantVCF = fix_764_b37.out
annotate_764_b37.reference_sequence = b37_ref
annotate_764_b37.annotation :+= "ChromosomeCounts"
annotate_764_b37.out = new File(vcf_dir+swapExt(fix_764_b37.out.getName,".vcf",".annot.vcf"))
add(annotate_421,annotate_764,annotate_764_b37)
/** Eval the omni chip against the various comps **/
runEval(annotate_764.out,gunzip_p1_ceu.outFile,"OMNI_764","Pilot1_CEU",pilot1_interval_list, b36_ref)
runEval(annotate_421.out,gunzip_p1_ceu.outFile,"OMNI_421","Pilot1_CEU",pilot1_interval_list, b36_ref)
runEval(annotate_764.out,gunzip_p1_chb.outFile,"OMNI_764","Pilot1_CHB",pilot1_interval_list, b36_ref)
runEval(annotate_421.out,gunzip_p1_chb.outFile,"OMNI_421","Pilot1_CHB",pilot1_interval_list, b36_ref)
runEval(annotate_764.out,gunzip_p1_yri.outFile,"OMNI_764","Pilot1_YRI",pilot1_interval_list, b36_ref)
runEval(annotate_421.out,gunzip_p1_yri.outFile,"OMNI_421","Pilot1_YRI",pilot1_interval_list, b36_ref)
runEval(annotate_764.out,pilot3_release_vcf,"OMNI_764","Pilot3",pilot3_interval_list, b36_ref)
runEval(annotate_421.out,pilot3_release_vcf,"OMNI_421","Pilot3",pilot3_interval_list, b36_ref)
runEval(annotate_764_b37.out,gunzip_ag_eur.outFile,"OMNI_764","August_EUR",production_interval_list, b37_ref)
runEval(annotate_764_b37.out,gunzip_ag_asn.outFile,"OMNI_764","August_ASN",production_interval_list, b37_ref)
runEval(annotate_764_b37.out,gunzip_ag_afr.outFile,"OMNI_764","Ausust_AFR",production_interval_list, b37_ref)
runEval(annotate_764.out,gunzip_hiseq.outFile,"OMNI_764","HiSeq",hiseq_interval_list, b36_ref)
runEval(annotate_764.out,OMNI_hapmap_b36_vcf,"OMNI_764","OMNI_421",pilot1_interval_list,b36_ref)
var eval1KG_exclude = new VariantEval with OmniArgs
eval1KG_exclude.samples :+= "/broad/shptmp/chartl/omni/scratch/OMNI_764_vs_Pilot3.sample_overlap.exclude.mixups.txt"
eval1KG_exclude.rodBind :+= new RodBind("evalOMNI_764","VCF",annotate_764.out)
eval1KG_exclude.rodBind :+= new RodBind("compPilot3","VCF",pilot3_release_vcf)
eval1KG_exclude.evalModule :+= "GenotypeConcordance"
eval1KG_exclude.evalModule :+= "SimpleMetricsBySample"
eval1KG_exclude.reference_sequence = b36_ref
eval1KG_exclude.reportType = Some(VE2TemplateType.CSV)
eval1KG_exclude.intervalsString :+= pilot3_interval_list
eval1KG_exclude.out = new File(eval_dir+"%s_vs_%s.%s".format("OMNI_764","Pilot3","exclude.mixups.eval.csv"))
add(eval1KG_exclude)
runAFComparison(annotate_764.out, gunzip_p1_ceu.outFile, gunzip_p1_chb.outFile, gunzip_p1_yri.outFile)
}
def processAuxiliaryChipData(otherChips: File) : List[(String,File)] = {
// todo ==== me
return Nil
}
def runEval(eval: File, comp: File, eBase: String, cBase: String, intervals: String, reference: File) = {
var base = "%s_vs_%s".format(eBase,cBase)
var getOverlap = new GetSampleOverlap
getOverlap.in_vcfs :+= eval
getOverlap.in_vcfs :+= comp
getOverlap.outFile = new File(scratch_dir+base+".sample_overlap.txt")
add(getOverlap)
var vEval = new VariantEval with OmniArgs
vEval.samples :+= getOverlap.outFile.getAbsolutePath
vEval.rodBind :+= new RodBind("eval"+eBase,"VCF",eval)
vEval.rodBind :+= new RodBind("comp"+cBase,"VCF",comp)
vEval.evalModule :+= "GenotypeConcordance"
vEval.evalModule :+= "SimpleMetricsBySample"
vEval.intervalsString :+= intervals
vEval.reference_sequence = reference
vEval.reportType = Some(VE2TemplateType.CSV)
vEval.out = new File(eval_dir+base+".eval.csv")
add(vEval)
}
def swapExt(s: String, d: String, f: String) : String = {
return s.stripSuffix(d)+f
}
def runAFComparison(omni: File, p1ceu: File, p1asn: File, p1yri:File ) : Boolean = {
// step one, set up some of the info
var populations : List[String] = Nil // these are the pilot 1 populations
populations :+= "CEU"
populations :+= "CHBJPT"
populations :+= "YRI"
var panels : List[String] = Nil // these are the analysis panels
panels :+= "EUR"
panels :+= "ASN"
panels :+= "ASW"
panels :+= "AFR"
panels :+= "ADM"
// step two -- subset the OMNI chip to the actual sample names
var nameToSubset: HashMap[String,SelectVariants] = new HashMap[String,SelectVariants]
for ( p <- populations ) {
nameToSubset += p -> sampleSubset(p,omni)
}
for ( p <- panels ) {
nameToSubset += p -> sampleSubset(p,omni)
}
// step three -- compare the pilot 1 files against all populations and panels
runComps("Pilot1CEU",p1ceu,"CEU",nameToSubset("CEU").out)
runComps("Pilot1CEU",p1ceu,"EUR",nameToSubset("EUR").out)
runComps("Pilot1CHBJPT",p1asn,"CHBJPT",nameToSubset("CHBJPT").out)
runComps("Pilot1CHBJPT",p1asn,"ASN",nameToSubset("ASN").out)
runComps("Pilot1YRI",p1yri,"YRI",nameToSubset("YRI").out)
runComps("Pilot1YRI",p1yri,"AFR",nameToSubset("AFR").out)
runComps("EUR",nameToSubset("EUR").out,"AFR",nameToSubset("AFR").out)
runComps("EUR",nameToSubset("EUR").out,"ASN",nameToSubset("ASN").out)
runComps("EUR",nameToSubset("EUR").out,"ASW",nameToSubset("ASW").out)
runComps("EUR",nameToSubset("EUR").out,"AMR",nameToSubset("ADM").out)
return true
}
def getOmniSampleListByPanel(panel: String) : String = {
return scratch_dir+"OMNI_764_%s.txt".format(panel)
}
def sampleSubset(panel: String, omni: File) : SelectVariants = {
var sv : SelectVariants = new SelectVariants with OmniArgs
sv.reference_sequence = b36_ref
sv.variantVCF = omni
sv.sample :+= getOmniSampleListByPanel(panel)
sv.out = new File(vcf_dir+swapExt(omni.getName,".vcf",".%s.vcf".format(panel)))
add(sv)
return sv
}
def runComps(eBase: String, evalVCF: File, cBase: String, compVCF: File) = {
var eval: VariantEval = new VariantEval with OmniArgs
eval.reference_sequence = b36_ref
eval.rodBind :+= new RodBind("eval%s".format(eBase),"VCF",evalVCF)
eval.rodBind :+= new RodBind("comp%s".format(cBase),"VCF",compVCF)
eval.noStandard = true
eval.E :+= "AlleleFrequencyComparison"
add(eval)
var combine: CombineVariants = new CombineVariants with OmniArgs
combine.reference_sequence = b36_ref
combine.rodBind :+= new RodBind(eBase,"VCF",evalVCF)
combine.rodBind :+= new RodBind(cBase,"VCF",compVCF)
combine.out = new File(vcf_dir+"%s_plus_%s.vcf".format(eBase,cBase))
combine.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.UNIQUIFY)
combine.priority = "%s,%s".format(eBase,cBase)
add(combine)
}
}