diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java index 9c68a9e84..c61cd74ec 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java @@ -261,13 +261,16 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva qualityScoreHistograms = new QualityScoreHistograms(); } + if ( alleleCountStats == null && eval != null && validation != null ) { + alleleCountStats = new ACStats(eval,validation,Genotype.Type.values().length); + alleleCountSummary = new ACSummaryStats(eval, validation); + } + if (sampleStats == null) { if (eval != null) { // initialize the concordance table sampleStats = new SampleStats(eval,Genotype.Type.values().length); - alleleCountStats = new ACStats(eval,Genotype.Type.values().length, new CompACNames(veWalker.getLogger())); sampleSummaryStats = new SampleSummaryStats(eval); - alleleCountSummary = new ACSummaryStats(eval, new CompACNames(veWalker.getLogger())); for (final VariantContext vc : missedValidationData) { determineStats(null, vc); } @@ -319,10 +322,8 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva } sampleStats.incrValue(sample, truth, called); - if ( evalAC != null ) { + if ( evalAC != null && validationAC != null) { alleleCountStats.incrValue(evalAC,truth,called); - } - if ( validationAC != null ) { alleleCountStats.incrValue(validationAC,truth,called); } } @@ -510,24 +511,21 @@ class SampleStats implements TableType { * Sample stats, but for AC */ class ACStats extends SampleStats { - private final CompACNames myComp; private String[] rowKeys; - public ACStats(VariantContext vc, int nGenotypeTypes, CompACNames comp) { + public ACStats(VariantContext evalvc, VariantContext compvc, int nGenotypeTypes) { super(nGenotypeTypes); - rowKeys = new String[2+4*vc.getGenotypes().size()]; - for ( int i = 0; i <= 2*vc.getGenotypes().size(); i++ ) { // todo -- assuming ploidy 2 here... + rowKeys = new String[1+2*evalvc.getGenotypes().size()+1+2*compvc.getGenotypes().size()]; + for ( int i = 0; i <= 2*evalvc.getGenotypes().size(); i++ ) { // todo -- assuming ploidy 2 here... concordanceStats.put(String.format("evalAC%d",i),new long[nGenotypeTypes][nGenotypeTypes]); rowKeys[i] = String.format("evalAC%d",i); } - for ( int i = 0; i <= 2*vc.getGenotypes().size(); i++ ) { + for ( int i = 0; i <= 2*compvc.getGenotypes().size(); i++ ) { concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]); - rowKeys[1+2*vc.getGenotypes().size()+i] = String.format("compAC%d",i); + rowKeys[1+2*evalvc.getGenotypes().size()+i] = String.format("compAC%d",i); } - - myComp = comp; } public String getName() { @@ -717,23 +715,21 @@ class SampleSummaryStats implements TableType { * SampleSummaryStats .. but for allele counts */ class ACSummaryStats extends SampleSummaryStats { - final private CompACNames myComp; private String[] rowKeys; - public ACSummaryStats (final VariantContext vc, CompACNames comp) { + public ACSummaryStats (final VariantContext evalvc, final VariantContext compvc) { concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); - rowKeys = new String[3+4*vc.getGenotypes().size()]; + rowKeys = new String[3+2*evalvc.getGenotypes().size() + 2*compvc.getGenotypes().size()]; rowKeys[0] = ALL_SAMPLES_KEY; - for( int i = 0; i <= 2*vc.getGenotypes().size() ; i ++ ) { + for( int i = 0; i <= 2*evalvc.getGenotypes().size() ; i ++ ) { concordanceSummary.put(String.format("evalAC%d",i), new double[COLUMN_KEYS.length]); rowKeys[i+1] = String.format("evalAC%d",i); } - for( int i = 0; i <= 2*vc.getGenotypes().size() ; i ++ ) { + for( int i = 0; i <= 2*compvc.getGenotypes().size() ; i ++ ) { concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]); - rowKeys[2+2*vc.getGenotypes().size()+i] = String.format("compAC%d",i); + rowKeys[2+2*evalvc.getGenotypes().size()+i] = String.format("compAC%d",i); } - myComp = comp; } public String getName() { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index ce83022cc..f2dc051d7 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -99,7 +99,7 @@ public class LiftoverVariants extends RodWalker { vc = VariantContextUtils.modifyLocation(vc, GenomeLocParser.createPotentiallyInvalidGenomeLoc(toInterval.getSequence(), toInterval.getStart(), toInterval.getStart() + length)); VariantContext newVC = VariantContext.createVariantContextWithPaddedAlleles(vc, ref.getBase(), false); - if ( VariantContextUtils.getSNPSubstitutionType(originalVC) != VariantContextUtils.getSNPSubstitutionType(newVC) ) { + if ( originalVC.isVariant() && VariantContextUtils.getSNPSubstitutionType(originalVC) != VariantContextUtils.getSNPSubstitutionType(newVC) ) { logger.warn(String.format("VCF at %s / %d => %s / %d is switching substitution type %s/%s to %s/%s", originalVC.getChr(), originalVC.getStart(), newVC.getChr(), newVC.getStart(), originalVC.getReference(), originalVC.getAlternateAllele(0), newVC.getReference(), newVC.getAlternateAllele(0))); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/annotator/HammingDistance.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/annotator/HammingDistance.java new file mode 100755 index 000000000..7d4b3a4ce --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/annotator/HammingDistance.java @@ -0,0 +1,111 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.annotator; + +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFHeaderLineType; +import org.broad.tribble.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Oct 20, 2010 + * Time: 3:08:06 PM + * To change this template use File | Settings | File Templates. + */ +public class HammingDistance implements ExperimentalAnnotation, InfoFieldAnnotation { + + public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + if ( tracker == null ) { + return null; + } + VariantContext hamCon = tracker.getVariantContext(ref,"hamming",null,ref.getLocus(),true); + if ( hamCon == null ) { + return null; + } + + Set interSamples = new HashSet(vc.getSampleNames()); + interSamples.retainAll(hamCon.getSampleNames()); + + int dist = 0; + int nrd_num = 0; + int hamCalls = 0; + int vcCallsAtHamCalls = 0; + int num_variant = 0; + + for ( String s : interSamples ) { + dist += dist(vc.getGenotype(s),hamCon.getGenotype(s),true); + nrd_num += dist(vc.getGenotype(s),hamCon.getGenotype(s),false); + if ( vc.getGenotype(s).isHet() || vc.getGenotype(s).isHomVar() || hamCon.getGenotype(s).isHet() || hamCon.getGenotype(s).isHomVar() ) { + num_variant ++; + } + if ( hamCon.getGenotype(s).isCalled() ) { + hamCalls++; + if ( vc.getGenotype(s).isCalled() ) { + vcCallsAtHamCalls++; + } + } + + } + + HashMap map = new HashMap(1); + map.put("HMD",dist); + map.put("HCR",(0.0+vcCallsAtHamCalls)/(0.0+hamCalls)); + map.put("NRD",(0.0+nrd_num)/(0.0+num_variant)); + map.put("OGC",(0.0+nrd_num)/(0.0+interSamples.size())); + return map; + + } + + public int dist(Genotype a, Genotype b, boolean weightByAC) { + if ( a.isNoCall() || b.isNoCall() ) { + return 0; + } + if ( weightByAC ) { + if ( a.isHomRef() ) { + if ( b.isHomVar() ) { + return 2; + } else if ( b.isHet() ) { + return 1; + } else { + return 0; + } + } else if ( a.isHet() ) { + if ( b.isHom() ) { + return 1; + } else { + return 0; + } + } else { + if ( b.isHomRef() ) { + return 2; + } else if ( b.isHet() ) { + return 1; + } else { + return 0; + } + } + } else { + if ( ! a.equals(b) ) { + return 1; + } else { + return 0; + } + } + } + + public List getKeyNames() { return Arrays.asList("HMD","HCR","NRD","OGC"); } + + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HMD",1, VCFHeaderLineType.Integer,"The hamming distance between record in Hamming ROD and this record"), + new VCFInfoHeaderLine("HCR",1,VCFHeaderLineType.Float,"The differential call rate between record in Hamming ROD and this record"), + new VCFInfoHeaderLine("NRD",1,VCFHeaderLineType.Float,"The Non-reference discrepancy between Hamming ROD and this record"), + new VCFInfoHeaderLine("OGC",1,VCFHeaderLineType.Float,"The Overall Genotype Concordance between Hamming ROD and this one")); } + + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/AlleleFrequencyComparison.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/AlleleFrequencyComparison.java index 924eed830..80b6020e4 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/AlleleFrequencyComparison.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/AlleleFrequencyComparison.java @@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.report.tags.DataPoint; import org.broadinstitute.sting.utils.report.utils.TableType; import org.broadinstitute.sting.utils.vcf.VCFUtils; +import java.util.HashMap; import java.util.List; import java.util.Map; @@ -43,27 +44,38 @@ public class AlleleFrequencyComparison extends VariantEvaluator { if ( ! (isValidVC(eval) && isValidVC(comp)) ) { return null; } else { - if ( missingField(eval) ) { + // todo -- this is a godawful hack. The "right way" isn't working, so do it the unsafe way for now. Note that + // todo -- this precludes getting the AC/AF values from the info field because some may not be there... + /*if ( missingField(eval) ) { recalculateCounts(eval); } if ( missingField(comp) ) { recalculateCounts(comp); - } - afTable.update(getAF(eval),getAF(comp)); - acTable.update(getAC(eval),getAC(comp)); + }*/ + HashMap evalCounts = new HashMap(2); + HashMap compCounts = new HashMap(2); + + VariantContextUtils.calculateChromosomeCounts(eval,evalCounts,false); + VariantContextUtils.calculateChromosomeCounts(comp,compCounts,false); + afTable.update(((List)evalCounts.get("AF")).get(0),((List)compCounts.get("AF")).get(0)); + acTable.update(((List)evalCounts.get("AC")).get(0),((List)compCounts.get("AC")).get(0)); } return null; // there is nothing interesting } private static boolean missingField(final VariantContext vc) { - return ! ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)); + return ! ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ); } - private static void recalculateCounts(VariantContext vc) { - Map attributes = vc.getAttributes(); + private void recalculateCounts(VariantContext vc) { + Map attributes = new HashMap(); VariantContextUtils.calculateChromosomeCounts(vc,attributes,false); - VariantContext.modifyAttributes(vc,attributes); + vc = VariantContext.modifyAttributes(vc,attributes); + getVEWalker().getLogger().debug(String.format("%s %s | %s %s",attributes.get("AC"),attributes.get("AF"),vc.getAttribute("AC"),vc.getAttribute("AF"))); + if ( attributes.size() == 2 && missingField(vc) ) { + throw new org.broadinstitute.sting.utils.exceptions.StingException("VariantContext should have had attributes modified but did not"); + } } private static boolean isValidVC(final VariantContext vc) { @@ -72,7 +84,11 @@ public class AlleleFrequencyComparison extends VariantEvaluator { private static double getAF(VariantContext vc) { Object af = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY); - if ( List.class.isAssignableFrom(af.getClass())) { + if ( af == null ) { + //throw new UserException("Variant context "+vc.getName()+" does not have allele frequency entry which is required for this walker"); + // still none after being re-computed; this is 0.00 + return 0.00; + } else if ( List.class.isAssignableFrom(af.getClass())) { return ( (List) af ).get(0); } else if ( String.class.isAssignableFrom(af.getClass())) { // two possibilities @@ -84,7 +100,7 @@ public class AlleleFrequencyComparison extends VariantEvaluator { return Double.parseDouble(s); } } catch (NumberFormatException e) { - throw new UserException("Allele frequency field may be improperly formatted, found AF="+s); + throw new UserException("Allele frequency field may be improperly formatted, found AF="+s,e); } } else if ( Double.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY).getClass())) { return (Double) af; @@ -94,8 +110,11 @@ public class AlleleFrequencyComparison extends VariantEvaluator { } private static int getAC(VariantContext vc) { - Object ac = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY); - if ( List.class.isAssignableFrom(ac.getClass())) { + Object ac = vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY); + if ( ac == null ) { + // still none after being re computed; this is 0 + return 0; + } else if ( List.class.isAssignableFrom(ac.getClass())) { return ( (List) ac ).get(0); } else if ( String.class.isAssignableFrom(ac.getClass())) { // two possibilities @@ -107,9 +126,9 @@ public class AlleleFrequencyComparison extends VariantEvaluator { return Integer.parseInt(s); } } catch (NumberFormatException e) { - throw new UserException("Allele count field may be improperly formatted, found AC="+s); + throw new UserException(String.format("Allele count field may be improperly formatted, found AC=%s for record %s:%d",ac,vc.getChr(),vc.getStart()),e); } - } else if ( Double.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY).getClass())) { + } else if ( Integer.class.isAssignableFrom(ac.getClass())) { return (Integer) ac; } else { throw new UserException(String.format("Class of Allele Frequency does not appear to be formated, had AF=%s, of class %s",ac.toString(),ac.getClass())); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/FixRefBases.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/FixRefBases.java index 71c6b4b8f..1fddcfc0e 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/FixRefBases.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/FixRefBases.java @@ -3,10 +3,7 @@ package org.broadinstitute.sting.oneoffprojects.walkers.vcftools; import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.VariantContext; -import org.broad.tribble.vcf.VCFConstants; -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFHeaderLine; -import org.broad.tribble.vcf.VCFWriter; +import org.broad.tribble.vcf.*; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Output; @@ -21,6 +18,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.vcf.VCFUtils; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.exceptions.*; import java.util.*; @@ -46,7 +44,8 @@ public class FixRefBases extends RodWalker { Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList("variant")); Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); - headerLines.add(new VCFHeaderLine("source", "SelectVariants")); + headerLines.add(new VCFHeaderLine("source", "FixRefBases")); + headerLines.add(new VCFInfoHeaderLine("FRI",1,VCFHeaderLineType.String,"The Fix-Ref-Info (if present, either \"Flipped\" or \"Fixed\")")); out.writeHeader(new VCFHeader(headerLines,vcfSamples)); } @@ -56,7 +55,13 @@ public class FixRefBases extends RodWalker { public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker != null && tracker.hasROD("variant") ) { - VariantContext vc = tracker.getVariantContext(ref,"variant",null,context.getLocation(),true); + VariantContext vc = null; + try { + vc = tracker.getVariantContext(ref,"variant",null,context.getLocation(),true); + } catch ( ReviewedStingException e ) { + logger.warn("Multiple variants found, catching exception ",e); + return 0; + } VariantContext newContext = null; if ( vc.isSNP() && ref.getBase() != vc.getReference().getBases()[0] && vc.getReference().length() == 1 ) { if ( basesAreFlipped(vc,ref) ) { @@ -66,13 +71,19 @@ public class FixRefBases extends RodWalker { HashSet newAlleles = new HashSet(vc.getAlternateAlleles()); Allele refAllele = Allele.create(ref.getBase(),true); newAlleles.add(refAllele); + HashMap newAttributes = new HashMap(vc.getAttributes()); + newAttributes.put("FRI",String.format("Fixed%s-%s",vc.getReference().toString(),refAllele)); newContext = new VariantContext("FixRefBasesVC", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop(), newAlleles, fixGenotypes(vc.getGenotypes(),refAllele), vc.hasNegLog10PError() ? 10.0*vc.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE, - vc.isFiltered() ? null : vc.getFilters(), vc.getAttributes()); + vc.isFiltered() ? vc.getFilters() : null, newAttributes); } + if ( ! newContext.hasAttribute("FRI") ) { + throw new StingException("FRI for fixed base not propagated. vc="+vc.toString()); + } out.add(newContext,ref.getBase()); + return 1; } else { out.add(vc,ref.getBase()); @@ -100,9 +111,12 @@ public class FixRefBases extends RodWalker { } private VariantContext flipBases(VariantContext vc, ReferenceContext ref) { + logger.info(String.format("Flipping bases at variant position %s:%d",vc.getChr(),vc.getStart())); HashSet newAlleles = new HashSet(vc.getAlleles().size()); newAlleles.add(Allele.create(ref.getBase(),true)); newAlleles.add(Allele.create(vc.getReference().getBases()[0],false)); + Map attribs = new HashMap(vc.getAttributes()); + attribs.put("FRI",String.format("Flipped%s-%s",vc.getReference().toString(),Allele.create(ref.getBase(),true).toString())); for ( Allele a : vc.getAlternateAlleles() ) { if ( a.getBases()[0] != ref.getBase() ) { newAlleles.add(a); @@ -112,9 +126,8 @@ public class FixRefBases extends RodWalker { VariantContext newVC = new VariantContext("FixRefBasesVC", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop(), newAlleles, flipGenotypes(vc.getGenotypes(),newAlleles), vc.hasNegLog10PError() ? 10.0*vc.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE, - vc.isFiltered() ? null : vc.getFilters(), vc.getAttributes()); + vc.isFiltered() ? vc.getFilters() : null, attribs); - Map attribs = new HashMap(newVC.getAttributes()); VariantContextUtils.calculateChromosomeCounts(newVC,attribs,false); VariantContext.modifyAttributes(newVC,attribs); return newVC; diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index c35e3e4c0..a1aafae45 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -29,7 +29,7 @@ public class String extraArgs = "-L 1:1-10,000,000"; for (String tests : testsEnumerations) { WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -o %s", - 1, Arrays.asList("158ac8e6d32eb2ea1bbeebfa512965de")); + 1, Arrays.asList("8a0203f0533b628ad7f1f230a43f105f")); executeTestParallel("testSelect1", spec); //executeTest("testSelect1", spec); } @@ -39,7 +39,7 @@ public class public void testSelect2() { String extraArgs = "-L 1:1-10,000,000"; WalkerTestSpec spec = new WalkerTestSpec( withSelect(withSelect(root, "DP < 50", "DP50"), "set==\"Intersection\"", "intersection") + " " + extraArgs + " -o %s", - 1, Arrays.asList("cee96f61ffa1d042fe0c63550c508ec9")); + 1, Arrays.asList("a55653980b750fdc8396eecb00e3b18c")); executeTestParallel("testSelect2", spec); //executeTest("testSelect2", spec); } @@ -60,8 +60,8 @@ public class @Test public void testVESimple() { HashMap expectations = new HashMap(); - expectations.put("-L 1:1-10,000,000", "ff8c4ba16c7c14b4edbaf440f20641f9"); - expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0 -E MendelianViolationEvaluator", "9b509ce5d31658eb09bb9597799b2908"); + expectations.put("-L 1:1-10,000,000", "6c9a12fdd62672b69b2ed4f0bc7e8f97"); + expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0 -E MendelianViolationEvaluator", "6a52a6454121974b71d9cfe2dad68c28"); for ( Map.Entry entry : expectations.entrySet() ) { String extraArgs = entry.getKey(); @@ -84,10 +84,10 @@ public class " -B:comp_hapmap,VCF " + validationDataLocation + "CEU_hapmap_nogt_23.vcf"; - String matchingMD5 = "1e6d6e152c9a90513dd5b6eaa3729388"; + String matchingMD5 = "7a8183402fe29f9c62c3c3cc4d45b46e"; expectations.put("", matchingMD5); expectations.put(" -known comp_hapmap -known dbsnp", matchingMD5); - expectations.put(" -known comp_hapmap", "d28dd504017f39a91cde8e6f096879d6"); + expectations.put(" -known comp_hapmap", "519b590635b7de035f8d5971115b47ba"); for (String tests : testsEnumerations) { for (Map.Entry entry : expectations.entrySet()) { String extraArgs2 = entry.getKey(); @@ -124,7 +124,7 @@ public class for (String tests : testsEnumerations) { WalkerTestSpec spec = new WalkerTestSpec(tests + " " + extraArgs + " -o %s -outputVCF %s -NO_HEADER", 2, - Arrays.asList("6b97a019402b3984fead9a4e8b7c7c2a", "989bc30dea6c8a4cf771cd1b9fdab488")); + Arrays.asList("50321436a65ef7d574286cb0a1c55f7e", "989bc30dea6c8a4cf771cd1b9fdab488")); executeTestParallel("testVEWriteVCF", spec); //executeTest("testVEWriteVCF", spec); } diff --git a/scala/qscript/chartl/omni_qc.q b/scala/qscript/chartl/omni_qc.q index 4a04f221b..be20c4f6b 100755 --- a/scala/qscript/chartl/omni_qc.q +++ b/scala/qscript/chartl/omni_qc.q @@ -24,17 +24,21 @@ class omni_qc extends QScript { var august_calls_EUR = new TaggedFile("/humgen/1kg/processing/release/august/EUR.vcf","vcf") var august_calls_ASN = new TaggedFile("/humgen/1kg/processing/release/august/ASN.vcf","vcf") var august_calls_AFR = new TaggedFile("/humgen/1kg/processing/release/august/AFR.vcf","vcf") - var august_calls_EUR_refined = new TaggedFile("/humgen/1kg/processing/release/august/EUR.beagle.vcf.gz","vcf") - var august_calls_ASN_refined = new TaggedFile("/humgen/1kg/processing/release/august/ASN.beagle.vcf.gz","vcf") - var august_calls_AFR_refined = new TaggedFile("/humgen/1kg/processing/release/august/AFR.beagle.vcf.gz","vcf") + var august_calls_EUR_refined = new TaggedFile("/humgen/1kg/processing/release/august/bgzip_for_release/EUR.beagle.vcf.gz","vcf") + var august_calls_ASN_refined = new TaggedFile("/humgen/1kg/processing/release/august/bgzip_for_release/ASN.beagle.vcf.gz","vcf") + var august_calls_AFR_refined = new TaggedFile("/humgen/1kg/processing/release/august/bgzip_for_release/AFR.beagle.vcf.gz","vcf") var hiseq_calls_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878.HiSeq.v9.b36.vcf.gz","vcf") var pilot1_with_na12878_vcf = new TaggedFile("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/calls/v2/N60/lowpass.N60.recal.mG6.retranche.vcf","vcf") + var pilot1_na12878_beagle = new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/calls/beagle/lowpass.N60.recal.CEUTSI.bgl.output.vcf") //var august_calls_other_genotypes = _ // OMNI VCF FILES var OMNI_b36_vcf = new TaggedFile("/humgen/illumina/1kg_seq_vcfs/Illumina_HapMap_Omni_2.5_764samples.vcf","vcf") var OMNI_b37_vcf = new TaggedFile("/broad/shptmp/chartl/Omni_2.5_764_samples.b37.deduped.vcf","vcf") var OMNI_hapmap_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_pilot.b36.vcf","vcf") + var OMNI_b36_panel_vcf = new TaggedFile("/broad/shptmp/chartl/omni/vcfs/Omni_b36_with_panel_sets.vcf","vcf") + var OMNI_b37_birdseed = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/OMNI_birdseed_only.vcf") + var OMNI_b37_joint = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/OMNI_joint_birdseed_lowpass.vcf") // INTERVALS var pilot3_interval_list: String = "/humgen/gsa-hpprojects/1kg/1kg_pilot3/documents/CenterSpecificTargetLists/results/p3overlap.targets.b36.interval_list" @@ -52,6 +56,14 @@ class omni_qc extends QScript { val scratch_dir = analysis_dir + "scratch/" val eval_dir = analysis_dir + "eval/" val vcf_dir = analysis_dir + "vcfs/" + val p1_ceu_only = scratch_dir+"Pilot1_CEU_only_sites.intervals.list" + val p1_chbjpt_only = scratch_dir+"Pilot1_CHBJPT_only_sites.intervals.list" + val p1_yri_only = scratch_dir+"Pilot1_YRI_only_sites.intervals.list" + + // OTHER CHIPS + + val OMNI_QUAD_1KG = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/other_chips/1KG_OMNI.ref_fixed.vcf") + val AFFY_6_0 = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/other_chips/1KG_ARRAY.ref_fixed.vcf") trait OmniArgs extends CommandLineGATK { this.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar") @@ -128,38 +140,42 @@ class omni_qc extends QScript { fix_421.variantVCF = OMNI_hapmap_b36_vcf fix_421.reference_sequence = b36_ref fix_421.out = new File(vcf_dir+swapExt(OMNI_hapmap_b36_vcf.getName,".vcf",".ref_fixed.vcf")) + fix_421.bypassException = true fix_764.variantVCF = OMNI_b36_vcf fix_764.reference_sequence = b36_ref fix_764.out = new File(vcf_dir+swapExt(OMNI_b36_vcf.getName,".vcf",".ref_fixed.vcf")) + fix_764.bypassException = true fix_764_b37.variantVCF = OMNI_b37_vcf fix_764_b37.reference_sequence = b37_ref fix_764_b37.out = new File(vcf_dir+swapExt(OMNI_b37_vcf.getName,".vcf",".ref_fixed.vcf")) - + fix_764_b37.bypassException = true + add(fix_421,fix_764,fix_764_b37) - /** Propagate AC/AN annotations to Omni files via variant annotator **/ + /** Propagate AC/AN annotations to Omni files via variant annotator **/ var annotate_421 = new VariantAnnotator with OmniArgs var annotate_764 = new VariantAnnotator with OmniArgs var annotate_764_b37 = new VariantAnnotator with OmniArgs - annotate_421.variantVCF = fix_421.out + annotate_421.variantVCF = OMNI_hapmap_b36_vcf annotate_421.reference_sequence = b36_ref annotate_421.annotation :+= "ChromosomeCounts" - annotate_421.out = new File(vcf_dir+swapExt(fix_421.out.getName,".vcf",".annot.vcf")) - annotate_764.variantVCF = fix_764.out + annotate_421.out = new File(vcf_dir+swapExt(annotate_421.variantVCF.getName,".vcf",".annot.vcf")) + annotate_764.variantVCF = OMNI_b36_vcf annotate_764.reference_sequence = b36_ref annotate_764.annotation :+= "ChromosomeCounts" - annotate_764.out = new File(vcf_dir+swapExt(fix_764.out.getName,".vcf",".annot.vcf")) - annotate_764_b37.variantVCF = fix_764_b37.out + annotate_764.out = new File(vcf_dir+swapExt(annotate_764.variantVCF.getName,".vcf",".annot.vcf")) + annotate_764_b37.variantVCF = OMNI_b37_vcf annotate_764_b37.reference_sequence = b37_ref annotate_764_b37.annotation :+= "ChromosomeCounts" - annotate_764_b37.out = new File(vcf_dir+swapExt(fix_764_b37.out.getName,".vcf",".annot.vcf")) + annotate_764_b37.out = new File(vcf_dir+swapExt(annotate_764_b37.variantVCF.getName,".vcf",".annot.vcf")) add(annotate_421,annotate_764,annotate_764_b37) /** Eval the omni chip against the various comps **/ runEval(annotate_764.out,gunzip_p1_ceu.outFile,"OMNI_764","Pilot1_CEU",pilot1_interval_list, b36_ref) - runEval(annotate_421.out,gunzip_p1_ceu.outFile,"OMNI_421","Pilot1_CEU",pilot1_interval_list, b36_ref) + runEval(annotate_421.out,gunzip_p1_ceu.outFile,"OMNI_421","Pilot1_CEU",pilot1_interval_list, b36_ref,true) + //runEval(OMNI_hapmap_b36_vcf,gunzip_p1_ceu.outFile,"OMNI_421_Unfixed","Pilot1_CEU",pilot1_interval_list,b36_ref) runEval(annotate_764.out,gunzip_p1_chb.outFile,"OMNI_764","Pilot1_CHB",pilot1_interval_list, b36_ref) runEval(annotate_421.out,gunzip_p1_chb.outFile,"OMNI_421","Pilot1_CHB",pilot1_interval_list, b36_ref) runEval(annotate_764.out,gunzip_p1_yri.outFile,"OMNI_764","Pilot1_YRI",pilot1_interval_list, b36_ref) @@ -170,7 +186,13 @@ class omni_qc extends QScript { runEval(annotate_764_b37.out,gunzip_ag_asn.outFile,"OMNI_764","August_ASN",production_interval_list, b37_ref) runEval(annotate_764_b37.out,gunzip_ag_afr.outFile,"OMNI_764","Ausust_AFR",production_interval_list, b37_ref) runEval(annotate_764.out,gunzip_hiseq.outFile,"OMNI_764","HiSeq",hiseq_interval_list, b36_ref) - runEval(annotate_764.out,OMNI_hapmap_b36_vcf,"OMNI_764","OMNI_421",pilot1_interval_list,b36_ref) + runEval(annotate_764.out,annotate_421.out,"OMNI_764","OMNI_421_FIXED",pilot1_interval_list,b36_ref) + runEval(annotate_764.out,OMNI_QUAD_1KG,"OMNI_764","OMNI_QUAD",pilot1_interval_list,b36_ref) + runEval(annotate_764.out,AFFY_6_0,"OMNI_764","AFFY_6_0",pilot1_interval_list,b36_ref) + runEval(OMNI_b37_birdseed,gunzip_ag_eur.outFile,"OMNI_birdseed","August_EUR",production_interval_list,b37_ref) + runEval(OMNI_b37_joint,gunzip_ag_eur.outFile,"OMNI_joint","August_EUR",production_interval_list,b37_ref) + runEval(OMNI_QUAD_1KG,gunzip_p1_ceu.outFile,"OMNI_QUAD_1KG","Pilot1_CEU",pilot1_interval_list,b36_ref) + runEval(AFFY_6_0,gunzip_p1_ceu.outFile,"AFFY_6_0","Pilot1_CEU",pilot1_interval_list,b36_ref) var eval1KG_exclude = new VariantEval with OmniArgs eval1KG_exclude.samples :+= "/broad/shptmp/chartl/omni/scratch/OMNI_764_vs_Pilot3.sample_overlap.exclude.mixups.txt" @@ -187,6 +209,31 @@ class omni_qc extends QScript { runAFComparison(annotate_764.out, gunzip_p1_ceu.outFile, gunzip_p1_chb.outFile, gunzip_p1_yri.outFile) + var subset421: SelectVariants = new SelectVariants with OmniArgs + subset421.reference_sequence = b36_ref + subset421.sample :+= (new File(scratch_dir+"OMNI_421_vs_Pilot1_CEU.sample_overlap.txt")).getAbsolutePath + subset421.variantVCF = annotate_764.out + subset421.out = new File(vcf_dir+swapExt(annotate_764.out.getName,".vcf",".subset.pilot1CEU.vcf")) + + add(subset421)// lastly to find things in the three-way pilot venn + + var combine: CombineVariants = new CombineVariants with OmniArgs + combine.reference_sequence = b36_ref + combine.rodBind :+= new RodBind("CEU","VCF",gunzip_p1_ceu.outFile) + combine.rodBind :+= new RodBind("ASN","VCF",gunzip_p1_chb.outFile) + combine.rodBind :+= new RodBind("YRI","VCF",gunzip_p1_yri.outFile) + combine.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.UNIQUIFY) + combine.priority = "%s,%s,%s".format("CEU","ASN","YRI") + combine.out = new File(vcf_dir+"Pilot1_Populations_Combined.vcf") + + add(combine) + + selectSites(OMNI_b36_panel_vcf,p1_ceu_only,"ceu_only_sites") + selectSites(OMNI_b36_panel_vcf,p1_chbjpt_only,"chbjpt_only_sites") + selectSites(OMNI_b36_panel_vcf,p1_yri_only,"yri_only_sites") + + runBeagleAnalysis(new File(vcf_dir + "Illumina_HapMap_Omni_2.5_764samples.annot.stripped.vcf")) + } def processAuxiliaryChipData(otherChips: File) : List[(String,File)] = { @@ -194,7 +241,7 @@ class omni_qc extends QScript { return Nil } - def runEval(eval: File, comp: File, eBase: String, cBase: String, intervals: String, reference: File) = { + def runEval(eval: File, comp: File, eBase: String, cBase: String, intervals: String, reference: File, interesting: Boolean = false) = { var base = "%s_vs_%s".format(eBase,cBase) var getOverlap = new GetSampleOverlap getOverlap.in_vcfs :+= eval @@ -214,6 +261,11 @@ class omni_qc extends QScript { vEval.out = new File(eval_dir+base+".eval.csv") + if ( interesting ) { + vEval.discordantInteresting = true + vEval.outputVCF = new File(vcf_dir+"%s_vs_%s.interesting_sites.vcf".format(eBase,cBase)) + } + add(vEval) } @@ -245,7 +297,7 @@ class omni_qc extends QScript { } // step three -- compare the pilot 1 files against all populations and panels - + runComps("Pilot1CEU",p1ceu,"CEU",nameToSubset("CEU").out) runComps("Pilot1CEU",p1ceu,"EUR",nameToSubset("EUR").out) runComps("Pilot1CHBJPT",p1asn,"CHBJPT",nameToSubset("CHBJPT").out) @@ -257,6 +309,20 @@ class omni_qc extends QScript { runComps("EUR",nameToSubset("EUR").out,"ASW",nameToSubset("ASW").out) runComps("EUR",nameToSubset("EUR").out,"AMR",nameToSubset("ADM").out) + + var panelCombine: CombineVariants = new CombineVariants with OmniArgs + panelCombine.reference_sequence = b36_ref + panelCombine.priority = "" + for ( p <- panels ) { + panelCombine.rodBind :+= new RodBind(p,"VCF",nameToSubset(p).out) + panelCombine.priority = if ( panelCombine.priority.equals("") ) p else panelCombine.priority + "," + p + } + panelCombine.out = OMNI_b36_panel_vcf + panelCombine.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE) + panelCombine.variantMergeOptions = Some(VariantContextUtils.VariantMergeType.UNION) + panelCombine.setKey = "panel" + + add(panelCombine) return true } @@ -282,6 +348,8 @@ class omni_qc extends QScript { eval.rodBind :+= new RodBind("comp%s".format(cBase),"VCF",compVCF) eval.noStandard = true eval.E :+= "AlleleFrequencyComparison" + eval.reportType = Some(VE2TemplateType.CSV) + eval.out = new File(eval_dir+"%s_vs_%s_allele_frequency.eval".format(eBase,cBase)) add(eval) @@ -293,7 +361,147 @@ class omni_qc extends QScript { combine.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.UNIQUIFY) combine.priority = "%s,%s".format(eBase,cBase) - add(combine) + //add(combine) } + + def selectSites(vcf: File, intervals: String, base: String) { + var sv = new SelectVariants with OmniArgs + sv.reference_sequence = b36_ref + sv.variantVCF = vcf + sv.out = swapExt(vcf,".vcf",base+".vcf") + sv.intervalsString :+= intervals + + add(sv) + } + + def runBeagleAnalysis(omnivcf: File) { + var combine : CombineVariants = new CombineVariants with OmniArgs + combine.reference_sequence = b36_ref + for ( c <- 1 until 23) { + combine.rodBind :+= new RodBind("beagle%s".format(c),"VCF",runBeagle(omnivcf,"%s".format(c))) + if ( c > 1 ) { + combine.priority = combine.priority+",%s%s".format("beagle",c) + } else { + combine.priority = "%s%s".format("beagle",c) + } + } + combine.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.PRIORITIZE) + combine.variantMergeOptions = Some(VariantContextUtils.VariantMergeType.UNION) + + combine.out = swapExt(pilot1_with_na12878_vcf,".vcf",".beagle_refined_with_omni.vcf") + + add(combine) + + var select : SelectVariants = new SelectVariants with OmniArgs + select.reference_sequence = b36_ref + select.variantVCF = combine.out + select.sample :+= "NA12878" + select.out = new File(vcf_dir + "NA12878.lowpass.beagle.refined.with.pilot1.vcf") + + add(select) + + var eval : VariantEval = new VariantEval with OmniArgs + eval.reference_sequence = b36_ref + eval.rodBind :+= new RodBind("evalNA12878LowPass","VCF",select.out) + eval.rodBind :+= new RodBind("compNA12878HiSeq","VCF",hiseq_calls_vcf) + eval.E :+= "GenotypeConcordance" + eval.out = new File(eval_dir+"NA12878.lowpass.beagle.vs.HiSeq.eval") + eval.excludeIntervals :+= new File(pilot1_interval_list) + eval.reportType = Some(VE2TemplateType.CSV) + + add(eval) + + var eval2: VariantEval = new VariantEval with OmniArgs + eval2.reference_sequence = b36_ref + eval2.rodBind :+= new RodBind("evalNA12878Beagle","VCF",pilot1_na12878_beagle) + eval2.rodBind :+= new RodBind("compNA12878HiSeq","VCF",hiseq_calls_vcf) + eval2.E :+= "GenotypeConcordance" + eval2.sample :+= "NA12878" + eval2.out = new File(eval_dir+"NA12878.lowpass.nochip.vs.Hiseq.eval") + eval2.excludeIntervals :+= new File(pilot1_interval_list) + eval2.reportType = Some(VE2TemplateType.CSV) + + add(eval2) + + var eval3: VariantEval = new VariantEval with OmniArgs + eval3.reference_sequence = b36_ref + eval3.rodBind :+= new RodBind("evalNA12878NoBeagle","VCF",pilot1_with_na12878_vcf) + eval3.rodBind :+= new RodBind("compNA12878HiSeq","VCF",hiseq_calls_vcf) + eval3.E :+= "GenotypeConcordance" + eval3.sample :+= "NA12878" + eval3.out = new File(eval_dir+"NA12878.lowpass.nochip.norefined.vs.Hiseq.eval") + eval3.excludeIntervals :+= new File(pilot1_interval_list) + eval3.reportType = Some(VE2TemplateType.CSV) + + add(eval3) + } + + def runBeagle(omnivcf: File, chr: String): File = { + var beagleInput = new ProduceBeagleInput with OmniArgs + beagleInput.reference_sequence = b36_ref + beagleInput.intervalsString :+= chr + beagleInput.variantVCF = pilot1_with_na12878_vcf + beagleInput.rodBind :+= new RodBind("validation","VCF",omnivcf) + beagleInput.validation_genotype_ptrue = Some(0.99) + beagleInput.out = new File(scratch_dir+"/"+swapExt(beagleInput.variantVCF.getName,".vcf",".%s.beagle".format(chr))) + println (beagleInput.out.getAbsolutePath) + + var runBeagle : BeagleRefinement = new BeagleRefinement + runBeagle.beagleInput = beagleInput.out + runBeagle.beagleOutputBase = "Pilot1_NA12878_Beagle_with_OMNI_chr%s".format(chr) + runBeagle.beagleMemoryGigs = 6 + runBeagle.memoryLimit = Some(6) + runBeagle.beagleOutputDir = "" + runBeagle.freezeOutputs + + var gunzipPhased = new GunzipFile + gunzipPhased.gunzipMe = runBeagle.beaglePhasedFile + gunzipPhased.outFile = new File(scratch_dir+swapExt(runBeagle.beaglePhasedFile.getName,".gz","")) + + var gunzipLike = new GunzipFile + gunzipLike.gunzipMe = runBeagle.beagleLikelihoods + gunzipLike.outFile = new File(scratch_dir+swapExt(runBeagle.beagleLikelihoods.getName,".gz","")) + + + var convertBack : BeagleOutputToVCF = new BeagleOutputToVCF with OmniArgs + convertBack.reference_sequence = b36_ref + convertBack.variantVCF = pilot1_with_na12878_vcf + convertBack.intervalsString :+= chr + convertBack.rodBind :+= new RodBind("beagleR2","beagle",runBeagle.beagleRSquared) + convertBack.rodBind :+= new RodBind("beagleProbs","beagle",gunzipLike.outFile) + convertBack.rodBind :+= new RodBind("beaglePhased","beagle",gunzipPhased.outFile) + convertBack.out = new File(vcf_dir+swapExt(pilot1_with_na12878_vcf.getName,".vcf",".chr%s.beagle_refined_plus_omni.vcf".format(chr))) + + add(beagleInput,runBeagle,gunzipPhased,gunzipLike,convertBack) + + return convertBack.out + } + + class BeagleRefinement extends CommandLineFunction { + @Input(doc="The beagle input file") var beagleInput: File = _ + var beagleOutputBase: String = _ + var beagleMemoryGigs: Int = 4 + + /** + * Note: These get set + */ + @Output(doc="The beagle phased file") var beaglePhasedFile: File = _ + @Output(doc="The beagle likelihood file") var beagleLikelihoods: File = _ + @Output(doc="The beagle r2 file") var beagleRSquared: File = _ + var beagleOutputDir: String = _ + + def freezeOutputs = { + if ( beagleOutputDir == null && beagleInput.getParent == null ) { + beagleOutputDir = "" + } else if ( beagleOutputDir == null ) { + beagleOutputDir = beagleInput.getParent+"/" + } + beaglePhasedFile = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".phased.gz") + beagleLikelihoods = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".gprobs.gz") + beagleRSquared = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".r2") + } + + def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar /humgen/gsa-hpprojects/software/beagle/beagle.jar like=%s out=%s".format(beagleInput.getParent,beagleMemoryGigs,beagleInput.getAbsolutePath,beagleOutputBase) + } }