diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index b602f31f9..e8e57986c 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -212,6 +212,7 @@ public class SelectVariants extends RodWalker { VariantContext sub = subsetRecord(vc, samples); if ( (sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) { + //System.out.printf("%s%n",sub.toString()); for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) { if ( !VariantContextUtils.match(sub, jexl) ) { return 0; @@ -231,7 +232,7 @@ public class SelectVariants extends RodWalker { * @param vc the VariantContext record to subset * @param samples the samples to extract * @return the subsetted VariantContext - */ + */ private VariantContext subsetRecord(VariantContext vc, Set samples) { if ( samples == null || samples.isEmpty() ) return vc; @@ -246,17 +247,11 @@ public class SelectVariants extends RodWalker { HashMap attributes = new HashMap(sub.getAttributes()); - int alleleCount = 0; - int numberOfAlleles = 0; int depth = 0; for (String sample : sub.getSampleNames()) { Genotype g = sub.getGenotype(sample); if (g.isNotFiltered() && g.isCalled()) { - numberOfAlleles += g.getPloidy(); - - if (g.isHet()) { alleleCount++; } - else if (g.isHomVar()) { alleleCount += 2; } String dp = (String) g.getAttribute("DP"); if (dp != null && ! dp.equals(VCFConstants.MISSING_DEPTH_v3) && ! dp.equals(VCFConstants.MISSING_VALUE_v4) ) { @@ -265,13 +260,8 @@ public class SelectVariants extends RodWalker { } } - attributes.put("AC", alleleCount); - attributes.put("AN", numberOfAlleles); - if (numberOfAlleles == 0) { - attributes.put("AF", 0.0); - } else { - attributes.put("AF", ((double) alleleCount) / ((double) numberOfAlleles)); - } + + VariantContextUtils.calculateChromosomeCounts(sub,attributes,false); attributes.put("DP", depth); sub = VariantContext.modifyAttributes(sub, attributes); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/AlleleFrequencyComparison.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/AlleleFrequencyComparison.java index 16331ac03..924eed830 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/AlleleFrequencyComparison.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/AlleleFrequencyComparison.java @@ -6,11 +6,14 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.report.tags.Analysis; import org.broadinstitute.sting.utils.report.tags.DataPoint; import org.broadinstitute.sting.utils.report.utils.TableType; +import org.broadinstitute.sting.utils.vcf.VCFUtils; import java.util.List; +import java.util.Map; /** * @@ -40,6 +43,12 @@ public class AlleleFrequencyComparison extends VariantEvaluator { if ( ! (isValidVC(eval) && isValidVC(comp)) ) { return null; } else { + if ( missingField(eval) ) { + recalculateCounts(eval); + } + if ( missingField(comp) ) { + recalculateCounts(comp); + } afTable.update(getAF(eval),getAF(comp)); acTable.update(getAC(eval),getAC(comp)); } @@ -47,16 +56,64 @@ public class AlleleFrequencyComparison extends VariantEvaluator { return null; // there is nothing interesting } + private static boolean missingField(final VariantContext vc) { + return ! ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)); + } + + private static void recalculateCounts(VariantContext vc) { + Map attributes = vc.getAttributes(); + VariantContextUtils.calculateChromosomeCounts(vc,attributes,false); + VariantContext.modifyAttributes(vc,attributes); + } + private static boolean isValidVC(final VariantContext vc) { - return (vc != null && !vc.isFiltered() && vc.getAlternateAlleles().size() == 1 && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)); + return (vc != null && !vc.isFiltered() && vc.getAlternateAlleles().size() == 1); } private static double getAF(VariantContext vc) { - return ((List) vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)).get(0); + Object af = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY); + if ( List.class.isAssignableFrom(af.getClass())) { + return ( (List) af ).get(0); + } else if ( String.class.isAssignableFrom(af.getClass())) { + // two possibilities + String s = (String) af; + try { + if ( s.startsWith("[") ) { + return Double.parseDouble(s.replace("\\[","").replace("\\]","")); + } else { + return Double.parseDouble(s); + } + } catch (NumberFormatException e) { + throw new UserException("Allele frequency field may be improperly formatted, found AF="+s); + } + } else if ( Double.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY).getClass())) { + return (Double) af; + } else { + throw new UserException(String.format("Class of Allele Frequency does not appear to be formated, had AF=%s, of class %s",af.toString(),af.getClass())); + } } private static int getAC(VariantContext vc) { - return ((List) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY)).get(0); + Object ac = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY); + if ( List.class.isAssignableFrom(ac.getClass())) { + return ( (List) ac ).get(0); + } else if ( String.class.isAssignableFrom(ac.getClass())) { + // two possibilities + String s = (String) ac; + try { + if ( s.startsWith("[") ) { + return Integer.parseInt(s.replace("\\[","").replace("\\]","")); + } else { + return Integer.parseInt(s); + } + } catch (NumberFormatException e) { + throw new UserException("Allele count field may be improperly formatted, found AC="+s); + } + } else if ( Double.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY).getClass())) { + return (Integer) ac; + } else { + throw new UserException(String.format("Class of Allele Frequency does not appear to be formated, had AF=%s, of class %s",ac.toString(),ac.getClass())); + } } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/FixRefBases.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/FixRefBases.java new file mode 100755 index 000000000..2bb238c4d --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/FixRefBases.java @@ -0,0 +1,174 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.vcftools; + +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFConstants; +import org.broad.tribble.vcf.VCFHeader; +import org.broad.tribble.vcf.VCFHeaderLine; +import org.broad.tribble.vcf.VCFWriter; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RMD; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.vcf.VCFUtils; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Oct 16, 2010 + * Time: 8:20:41 PM + * To change this template use File | Settings | File Templates. + */ +@Requires(value={},referenceMetaData=@RMD(name="variant", type= VariantContext.class)) +public class FixRefBases extends RodWalker { + @Output(doc="output file to write to",required=true) + VCFWriter out; + + public void initialize() { + Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList("variant")); + Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); + headerLines.add(new VCFHeaderLine("source", "SelectVariants")); + out.writeHeader(new VCFHeader(headerLines)); + } + + public Integer reduceInit() { + return 0; + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker != null && tracker.hasROD("variant") ) { + VariantContext vc = tracker.getVariantContext(ref,"variant",null,context.getLocation(),true); + VariantContext newContext = null; + if ( vc.isSNP() && ref.getBase() != vc.getReference().getBases()[0] && vc.getReference().length() == 1 ) { + if ( basesAreFlipped(vc,ref) ) { + logger.warn(String.format("Variant context at %s has ref and alt bases flipped according to reference",ref.getLocus().toString())); + newContext = flipBases(vc,ref); + } else { + HashSet newAlleles = new HashSet(vc.getAlternateAlleles()); + Allele refAllele = Allele.create(ref.getBase(),true); + newAlleles.add(refAllele); + newContext = new VariantContext("FixRefBasesVC", ref.getLocus().getContig(), + ref.getLocus().getStart(), ref.getLocus().getStop(), newAlleles, fixGenotypes(vc.getGenotypes(),refAllele), + vc.hasNegLog10PError() ? 10.0*vc.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE, + vc.isFiltered() ? null : vc.getFilters(), vc.getAttributes()); + } + + out.add(newContext,ref.getBase()); + + } else { + out.add(vc,ref.getBase()); + } + } + + return 0; + } + + public Integer reduce(Integer map, Integer reduce) { + return map + reduce; + } + + public void onTraversalDone(Integer fReduce) { + logger.info(String.format("Fixed %d records",fReduce)); + } + + private boolean basesAreFlipped(VariantContext vc, ReferenceContext reference) { + for ( Allele a : vc.getAlternateAlleles() ) { + if ( a.getBases().length == 1 && a.getBases()[0] == reference.getBase() ) { + return true; + } + } + return false; + } + + private VariantContext flipBases(VariantContext vc, ReferenceContext ref) { + HashSet newAlleles = new HashSet(vc.getAlleles().size()); + newAlleles.add(Allele.create(ref.getBase(),true)); + newAlleles.add(Allele.create(vc.getReference().getBases()[0],false)); + for ( Allele a : vc.getAlternateAlleles() ) { + if ( a.getBases()[0] != ref.getBase() ) { + newAlleles.add(a); + } + } + + VariantContext newVC = new VariantContext("FixRefBasesVC", ref.getLocus().getContig(), + ref.getLocus().getStart(), ref.getLocus().getStop(), newAlleles, flipGenotypes(vc.getGenotypes(),newAlleles), + vc.hasNegLog10PError() ? 10.0*vc.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE, + vc.isFiltered() ? null : vc.getFilters(), vc.getAttributes()); + + Map attribs = new HashMap(newVC.getAttributes()); + VariantContextUtils.calculateChromosomeCounts(newVC,attribs,false); + VariantContext.modifyAttributes(newVC,attribs); + return newVC; + } + + private Map fixGenotypes(Map old, Allele newRef) { + HashMap newGTs = new HashMap(old.size()); + for ( Map.Entry e : old.entrySet() ) { + newGTs.put(e.getKey(),fixGenotype(e.getValue(),newRef)); + } + + return newGTs; + } + + private Genotype fixGenotype(Genotype g, Allele newRef) { + List newAlleles = new ArrayList(g.getAlleles().size()); + for ( Allele a : g.getAlleles() ) { + if ( a.isReference() ) { + newAlleles.add(newRef); + } else { + newAlleles.add(a); + } + } + return new Genotype(g.getSampleName(), + newAlleles,g.hasNegLog10PError() ? g.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE, + g.getAttributes().keySet(), g.getAttributes(),g.genotypesArePhased()); + } + + private Map flipGenotypes(Map old, Set newAlleles) { + HashMap newGTs = new HashMap(old.size()); + for ( Map.Entry e : old.entrySet() ) { + newGTs.put(e.getKey(),flipGenotype(e.getValue(),newAlleles)); + } + + return newGTs; + } + + private Genotype flipGenotype(Genotype old, Set newAlleles) { + Allele ref = null; + for ( Allele a : newAlleles ) { + if ( a.isReference() ) { + ref = a; + } + } + if ( ref == null ) { + throw new StingException("No reference allele in variant context with which to flip genotype alleles"); + } + + List newGTAlleles = new ArrayList(old.getAlleles().size()); + + for ( Allele a : old.getAlleles() ) { + if ( ! a.isNoCall() && a.getBases()[0] == ref.getBases()[0] ) { + newGTAlleles.add(ref); + } else { + if ( a.isReference() ) { + newGTAlleles.add(Allele.create(a.getBases(),false)); + } else { + newGTAlleles.add(a); + } + } + } + + return new Genotype(old.getSampleName(), + newGTAlleles,old.hasNegLog10PError() ? old.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE, + old.getAttributes().keySet(), old.getAttributes(),old.genotypesArePhased()); + } + +} diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 37c992a27..6180eb99f 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -16,9 +16,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -sn '[CDH]' -sn " + samplesFile + " -env -ef -select 'AF < 0.2' -B:variant,VCF " + testfile + " -NO_HEADER"), + baseTestString(" -sn A -sn '[CDH]' -sn " + samplesFile + " -env -ef -select 'DP < 250' -B:variant,VCF " + testfile + " -NO_HEADER"), 1, - Arrays.asList("3a15628b5980031c629c0c33e7e60b40") + Arrays.asList("e22db73fd99fd4f28d472d407e40ead5") ); executeTest("testComplexSelection--" + testfile, spec); diff --git a/scala/qscript/chartl/omni_qc.q b/scala/qscript/chartl/omni_qc.q index 39fc1a010..4a04f221b 100755 --- a/scala/qscript/chartl/omni_qc.q +++ b/scala/qscript/chartl/omni_qc.q @@ -1,6 +1,7 @@ import java.io.{FileReader, File, BufferedReader} import net.sf.picard.reference.FastaSequenceFile import org.broadinstitute.sting.datasources.pipeline.Pipeline +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils import org.broadinstitute.sting.gatk.DownsampleType import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model import org.broadinstitute.sting.queue.extensions.gatk._ @@ -10,6 +11,7 @@ import org.broadinstitute.sting.queue.{QException, QScript} import collection.JavaConversions._ import org.broadinstitute.sting.utils.yaml.YamlUtils import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType +import scala.collection.mutable.HashMap class omni_qc extends QScript { qscript => @@ -19,95 +21,279 @@ class omni_qc extends QScript { var pilot1_ceu_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CEU.low_coverage.2010_07.genotypes.vcf.gz","vcf") var pilot1_chb_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CHBJPT.low_coverage.2010_07.genotypes.vcf.gz","vcf") var pilot1_yri_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/YRI.low_coverage.2010_07.genotypes.vcf.gz","vcf") - var august_calls_vcf = null + var august_calls_EUR = new TaggedFile("/humgen/1kg/processing/release/august/EUR.vcf","vcf") + var august_calls_ASN = new TaggedFile("/humgen/1kg/processing/release/august/ASN.vcf","vcf") + var august_calls_AFR = new TaggedFile("/humgen/1kg/processing/release/august/AFR.vcf","vcf") + var august_calls_EUR_refined = new TaggedFile("/humgen/1kg/processing/release/august/EUR.beagle.vcf.gz","vcf") + var august_calls_ASN_refined = new TaggedFile("/humgen/1kg/processing/release/august/ASN.beagle.vcf.gz","vcf") + var august_calls_AFR_refined = new TaggedFile("/humgen/1kg/processing/release/august/AFR.beagle.vcf.gz","vcf") var hiseq_calls_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878.HiSeq.v9.b36.vcf.gz","vcf") var pilot1_with_na12878_vcf = new TaggedFile("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/calls/v2/N60/lowpass.N60.recal.mG6.retranche.vcf","vcf") + //var august_calls_other_genotypes = _ // OMNI VCF FILES - var OMNI_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b36.vcf","vcf") - var OMNI_b37_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b37.deduped.vcf.gz","vcf") - var OMNI_hapmap_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_pilot_b36.vcf.gz","vcf") + var OMNI_b36_vcf = new TaggedFile("/humgen/illumina/1kg_seq_vcfs/Illumina_HapMap_Omni_2.5_764samples.vcf","vcf") + var OMNI_b37_vcf = new TaggedFile("/broad/shptmp/chartl/Omni_2.5_764_samples.b37.deduped.vcf","vcf") + var OMNI_hapmap_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_pilot.b36.vcf","vcf") // INTERVALS var pilot3_interval_list: String = "/humgen/gsa-hpprojects/1kg/1kg_pilot3/documents/CenterSpecificTargetLists/results/p3overlap.targets.b36.interval_list" - var pilot1_interval_list: String = _ - var hiseq_interval_list: String = _ - var production_interval_list: String = "20" + var pilot1_interval_list: String = "/broad/shptmp/chartl/omni/resources/Omni_b36_sites.interval.list" + var hiseq_interval_list: String = "/broad/shptmp/chartl/omni/resources/Omni_b36_sites.interval.list" + var production_interval_list: String = "/broad/shptmp/chartl/omni/resources/Omni_b37_sites.chr20.interval.list" + + // REFERENCES + var b36_ref = new File("/humgen/1kg/reference/human_b36_both.fasta") + var b37_ref = new File("/humgen/1kg/reference/human_g1k_v37.fasta") // OTHER - var failed_qc_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/failed_qc_samples.txt") + val analysis_dir = "/broad/shptmp/chartl/omni/" + val resources_dir = analysis_dir + "resources/" + val scratch_dir = analysis_dir + "scratch/" + val eval_dir = analysis_dir + "eval/" + val vcf_dir = analysis_dir + "vcfs/" trait OmniArgs extends CommandLineGATK { - this.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta") this.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar") } - protected def sampleFileToString(samFile: File) : List[String] = { - var reader = new BufferedReader(new FileReader(samFile)) - var line: String = "" - var sampleList: List[String] = Nil - line = reader.readLine - while ( line != null ) { - sampleList :+= line - line = reader.readLine - } - - return sampleList - } - - class vcf2bed(b: String) extends CommandLineFunction { + class vcf2bed extends CommandLineFunction { @Input(doc="A VCF file to be put into an interval list") var in_vcf: File = _ @Output(doc="An interval list file to be used with -L") var out_list: File = _ def commandLine = "python /humgen/gsa-scr1/chartl/projects/omni/scripts/vcf2bed.py %s %s".format(in_vcf.getAbsolutePath,out_list.getAbsolutePath) } - protected def runme(samples: File, intervals: String, compVCF: TaggedFile, outDir: String, base: String) { - var subset = new SelectVariants with OmniArgs - subset.variantVCF = Omni_chip_vcf - subset.analysisName = "Subset_"+base - subset.sample = sampleFileToString(samples) - if ( intervals != null ) { - subset.intervalsString :+= intervals - } - subset.out = new TaggedFile(outDir+base+".subset.vcf","vcf") - - var makeOmniSiteList = new vcf2bed("foo") - makeOmniSiteList.analysisName = "Omni_list_"+base - makeOmniSiteList.in_vcf = subset.out - makeOmniSiteList.out_list = new TaggedFile(outDir+base+".subset.omni.intervals.list","intervals.list") + class GetSampleOverlap extends CommandLineFunction { + @Input(doc="A list of VCF files for which to calculate the sample overlap") var in_vcfs: List[File] = Nil + @Output(doc="A file to which to write the overlapping sample names") var outFile: File = _ - var eval = new VariantEval with OmniArgs - eval.rodBind :+= new RodBind("evalOmni","vcf",subset.out) - eval.rodBind :+= new RodBind("comp"+base,"vcf",compVCF) - eval.intervals = makeOmniSiteList.out_list - eval.evalModule :+= "GenotypeConcordance" - eval.evalModule :+= "SimpleMetricsBySample" - eval.reportLocation = new TaggedFile(outDir+base+".subset.omni.eval","eval") - eval.reportType = Some(VE2TemplateType.R) - eval.analysisName = base+"_Eval" - eval.select ++= List("AC==1","AC==2","AC==3","AC==4","AC==5","AC>5") - eval.selectName ++= List("AC1","AC2","AC3","AC4","AC5","AC6+") - eval.disI = true - eval.outputVCF = outDir+base+".eval.interesting.sites.vcf" + /*def commandLine = "grep #CHR %s | sed 's/.vcf:/\\t/g' | cut -f11- | tr '\\t' '\\n' | sort | uniq -c | awk '$1 == %d' | awk '{print $2}' > %s".format( + in_vcfs.foldLeft[String]("")( (str,f) => if ( str.equals("") ) str + f.getAbsolutePath else str + " " + f.getAbsolutePath), + in_vcfs.size, + outFile.getAbsolutePath + )*/ + def commandLine = "python /humgen/gsa-scr1/chartl/projects/omni/scripts/getOverlapSamples.py %s %s".format( + in_vcfs.foldLeft[String]("")( (str,f) => if ( str.equals("") ) str + f.getAbsolutePath else str + " " + f.getAbsolutePath), + outFile.getAbsolutePath + ) + } - if ( base.equalsIgnoreCase("production") ) { - subset.variantVCF = Omni_b37_vcf - subset.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_g1k_v37.fasta") - eval.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_g1k_v37.fasta") - eval.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod") - } + class GunzipFile extends CommandLineFunction { + @Input(doc="file to gunzip") var gunzipMe: File = _ + @Output(doc="file to gunzip to") var outFile: File = _ - - - add(subset,makeOmniSiteList,eval) + def commandLine = "gunzip -c %s > %s".format(gunzipMe.getAbsolutePath,outFile.getAbsolutePath) } def script = { - runme(pilot1_overlap_samples,pilot1_interval_list,pilot1_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot1/","pilot1") - runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf_broad,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3") - runme(hiseq_overlap_samples,hiseq_interval_list,hiSeq_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/hiSeq/","NA12878_HiSeq") - runme(production_overlap_samples,production_interval_list,production_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/production/","production") - runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3_release") + + /** Convert other chips to merged VCFs **/ + + //var august_call_other_chips: List[(String,File)] = processAuxiliaryChipData(august_calls_other_genotypes) + + + /** Unzip the pilot 1 VCFs and dump them into the resources directory **/ + + var gunzip_p1_ceu = new GunzipFile + var gunzip_p1_chb = new GunzipFile + var gunzip_p1_yri = new GunzipFile + var gunzip_hiseq = new GunzipFile + var gunzip_ag_eur = new GunzipFile + var gunzip_ag_asn = new GunzipFile + var gunzip_ag_afr = new GunzipFile + + gunzip_p1_ceu.gunzipMe = pilot1_ceu_vcf + gunzip_p1_ceu.outFile = new File(resources_dir+"CEU.low_coverage.genotypes.vcf") + gunzip_p1_chb.gunzipMe = pilot1_chb_vcf + gunzip_p1_chb.outFile = new File(resources_dir+"CHB.low_coverage.genotypes.vcf") + gunzip_p1_yri.gunzipMe = pilot1_yri_vcf + gunzip_p1_yri.outFile = new File(resources_dir+"YRI.low_coverage.genotypes.vcf") + gunzip_hiseq.gunzipMe = hiseq_calls_vcf + gunzip_hiseq.outFile = new File(resources_dir+"HiSeq.b36.vcf") + gunzip_ag_eur.gunzipMe = august_calls_EUR_refined + gunzip_ag_eur.outFile = new File(resources_dir+"EUR.refined.vcf") + gunzip_ag_asn.gunzipMe = august_calls_ASN_refined + gunzip_ag_asn.outFile = new File(resources_dir+"ASN.refined.vcf") + gunzip_ag_afr.gunzipMe = august_calls_AFR_refined + gunzip_ag_afr.outFile = new File(resources_dir+"AFR.refined.vcf") + + add(gunzip_p1_ceu,gunzip_p1_yri,gunzip_p1_chb,gunzip_hiseq,gunzip_ag_eur,gunzip_ag_asn,gunzip_ag_afr) + + /** fix the omni ref bases **/ + var fix_421 = new FixRefBases with OmniArgs + var fix_764 = new FixRefBases with OmniArgs + var fix_764_b37 = new FixRefBases with OmniArgs + + fix_421.variantVCF = OMNI_hapmap_b36_vcf + fix_421.reference_sequence = b36_ref + fix_421.out = new File(vcf_dir+swapExt(OMNI_hapmap_b36_vcf.getName,".vcf",".ref_fixed.vcf")) + fix_764.variantVCF = OMNI_b36_vcf + fix_764.reference_sequence = b36_ref + fix_764.out = new File(vcf_dir+swapExt(OMNI_b36_vcf.getName,".vcf",".ref_fixed.vcf")) + fix_764_b37.variantVCF = OMNI_b37_vcf + fix_764_b37.reference_sequence = b37_ref + fix_764_b37.out = new File(vcf_dir+swapExt(OMNI_b37_vcf.getName,".vcf",".ref_fixed.vcf")) + + add(fix_421,fix_764,fix_764_b37) + + /** Propagate AC/AN annotations to Omni files via variant annotator **/ + var annotate_421 = new VariantAnnotator with OmniArgs + var annotate_764 = new VariantAnnotator with OmniArgs + var annotate_764_b37 = new VariantAnnotator with OmniArgs + + annotate_421.variantVCF = fix_421.out + annotate_421.reference_sequence = b36_ref + annotate_421.annotation :+= "ChromosomeCounts" + annotate_421.out = new File(vcf_dir+swapExt(fix_421.out.getName,".vcf",".annot.vcf")) + annotate_764.variantVCF = fix_764.out + annotate_764.reference_sequence = b36_ref + annotate_764.annotation :+= "ChromosomeCounts" + annotate_764.out = new File(vcf_dir+swapExt(fix_764.out.getName,".vcf",".annot.vcf")) + annotate_764_b37.variantVCF = fix_764_b37.out + annotate_764_b37.reference_sequence = b37_ref + annotate_764_b37.annotation :+= "ChromosomeCounts" + annotate_764_b37.out = new File(vcf_dir+swapExt(fix_764_b37.out.getName,".vcf",".annot.vcf")) + + add(annotate_421,annotate_764,annotate_764_b37) + + /** Eval the omni chip against the various comps **/ + runEval(annotate_764.out,gunzip_p1_ceu.outFile,"OMNI_764","Pilot1_CEU",pilot1_interval_list, b36_ref) + runEval(annotate_421.out,gunzip_p1_ceu.outFile,"OMNI_421","Pilot1_CEU",pilot1_interval_list, b36_ref) + runEval(annotate_764.out,gunzip_p1_chb.outFile,"OMNI_764","Pilot1_CHB",pilot1_interval_list, b36_ref) + runEval(annotate_421.out,gunzip_p1_chb.outFile,"OMNI_421","Pilot1_CHB",pilot1_interval_list, b36_ref) + runEval(annotate_764.out,gunzip_p1_yri.outFile,"OMNI_764","Pilot1_YRI",pilot1_interval_list, b36_ref) + runEval(annotate_421.out,gunzip_p1_yri.outFile,"OMNI_421","Pilot1_YRI",pilot1_interval_list, b36_ref) + runEval(annotate_764.out,pilot3_release_vcf,"OMNI_764","Pilot3",pilot3_interval_list, b36_ref) + runEval(annotate_421.out,pilot3_release_vcf,"OMNI_421","Pilot3",pilot3_interval_list, b36_ref) + runEval(annotate_764_b37.out,gunzip_ag_eur.outFile,"OMNI_764","August_EUR",production_interval_list, b37_ref) + runEval(annotate_764_b37.out,gunzip_ag_asn.outFile,"OMNI_764","August_ASN",production_interval_list, b37_ref) + runEval(annotate_764_b37.out,gunzip_ag_afr.outFile,"OMNI_764","Ausust_AFR",production_interval_list, b37_ref) + runEval(annotate_764.out,gunzip_hiseq.outFile,"OMNI_764","HiSeq",hiseq_interval_list, b36_ref) + runEval(annotate_764.out,OMNI_hapmap_b36_vcf,"OMNI_764","OMNI_421",pilot1_interval_list,b36_ref) + + var eval1KG_exclude = new VariantEval with OmniArgs + eval1KG_exclude.samples :+= "/broad/shptmp/chartl/omni/scratch/OMNI_764_vs_Pilot3.sample_overlap.exclude.mixups.txt" + eval1KG_exclude.rodBind :+= new RodBind("evalOMNI_764","VCF",annotate_764.out) + eval1KG_exclude.rodBind :+= new RodBind("compPilot3","VCF",pilot3_release_vcf) + eval1KG_exclude.evalModule :+= "GenotypeConcordance" + eval1KG_exclude.evalModule :+= "SimpleMetricsBySample" + eval1KG_exclude.reference_sequence = b36_ref + eval1KG_exclude.reportType = Some(VE2TemplateType.CSV) + eval1KG_exclude.intervalsString :+= pilot3_interval_list + eval1KG_exclude.out = new File(eval_dir+"%s_vs_%s.%s".format("OMNI_764","Pilot3","exclude.mixups.eval.csv")) + + add(eval1KG_exclude) + + runAFComparison(annotate_764.out, gunzip_p1_ceu.outFile, gunzip_p1_chb.outFile, gunzip_p1_yri.outFile) + + } + + def processAuxiliaryChipData(otherChips: File) : List[(String,File)] = { + // todo ==== me + return Nil + } + + def runEval(eval: File, comp: File, eBase: String, cBase: String, intervals: String, reference: File) = { + var base = "%s_vs_%s".format(eBase,cBase) + var getOverlap = new GetSampleOverlap + getOverlap.in_vcfs :+= eval + getOverlap.in_vcfs :+= comp + getOverlap.outFile = new File(scratch_dir+base+".sample_overlap.txt") + add(getOverlap) + + var vEval = new VariantEval with OmniArgs + vEval.samples :+= getOverlap.outFile.getAbsolutePath + vEval.rodBind :+= new RodBind("eval"+eBase,"VCF",eval) + vEval.rodBind :+= new RodBind("comp"+cBase,"VCF",comp) + vEval.evalModule :+= "GenotypeConcordance" + vEval.evalModule :+= "SimpleMetricsBySample" + vEval.intervalsString :+= intervals + vEval.reference_sequence = reference + vEval.reportType = Some(VE2TemplateType.CSV) + + vEval.out = new File(eval_dir+base+".eval.csv") + + add(vEval) + + } + + def swapExt(s: String, d: String, f: String) : String = { + return s.stripSuffix(d)+f + } + + def runAFComparison(omni: File, p1ceu: File, p1asn: File, p1yri:File ) : Boolean = { + // step one, set up some of the info + var populations : List[String] = Nil // these are the pilot 1 populations + populations :+= "CEU" + populations :+= "CHBJPT" + populations :+= "YRI" + var panels : List[String] = Nil // these are the analysis panels + panels :+= "EUR" + panels :+= "ASN" + panels :+= "ASW" + panels :+= "AFR" + panels :+= "ADM" + // step two -- subset the OMNI chip to the actual sample names + var nameToSubset: HashMap[String,SelectVariants] = new HashMap[String,SelectVariants] + for ( p <- populations ) { + nameToSubset += p -> sampleSubset(p,omni) + } + + for ( p <- panels ) { + nameToSubset += p -> sampleSubset(p,omni) + } + + // step three -- compare the pilot 1 files against all populations and panels + + runComps("Pilot1CEU",p1ceu,"CEU",nameToSubset("CEU").out) + runComps("Pilot1CEU",p1ceu,"EUR",nameToSubset("EUR").out) + runComps("Pilot1CHBJPT",p1asn,"CHBJPT",nameToSubset("CHBJPT").out) + runComps("Pilot1CHBJPT",p1asn,"ASN",nameToSubset("ASN").out) + runComps("Pilot1YRI",p1yri,"YRI",nameToSubset("YRI").out) + runComps("Pilot1YRI",p1yri,"AFR",nameToSubset("AFR").out) + runComps("EUR",nameToSubset("EUR").out,"AFR",nameToSubset("AFR").out) + runComps("EUR",nameToSubset("EUR").out,"ASN",nameToSubset("ASN").out) + runComps("EUR",nameToSubset("EUR").out,"ASW",nameToSubset("ASW").out) + runComps("EUR",nameToSubset("EUR").out,"AMR",nameToSubset("ADM").out) + + return true + + } + + def getOmniSampleListByPanel(panel: String) : String = { + return scratch_dir+"OMNI_764_%s.txt".format(panel) + } + + def sampleSubset(panel: String, omni: File) : SelectVariants = { + var sv : SelectVariants = new SelectVariants with OmniArgs + sv.reference_sequence = b36_ref + sv.variantVCF = omni + sv.sample :+= getOmniSampleListByPanel(panel) + sv.out = new File(vcf_dir+swapExt(omni.getName,".vcf",".%s.vcf".format(panel))) + add(sv) + return sv + } + + def runComps(eBase: String, evalVCF: File, cBase: String, compVCF: File) = { + var eval: VariantEval = new VariantEval with OmniArgs + eval.reference_sequence = b36_ref + eval.rodBind :+= new RodBind("eval%s".format(eBase),"VCF",evalVCF) + eval.rodBind :+= new RodBind("comp%s".format(cBase),"VCF",compVCF) + eval.noStandard = true + eval.E :+= "AlleleFrequencyComparison" + + add(eval) + + var combine: CombineVariants = new CombineVariants with OmniArgs + combine.reference_sequence = b36_ref + combine.rodBind :+= new RodBind(eBase,"VCF",evalVCF) + combine.rodBind :+= new RodBind(cBase,"VCF",compVCF) + combine.out = new File(vcf_dir+"%s_plus_%s.vcf".format(eBase,cBase)) + combine.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.UNIQUIFY) + combine.priority = "%s,%s".format(eBase,cBase) + + add(combine) + } }