Bug fix to GenotypeConcordance. AC metrics were instantiated based on the number of eval samples; if Comp has more samples, we could see AC indices outside the bounds of the array.

Bug fix to LiftoverVariants - no longer errors at reference (non-variant) sites; the substitution-type check is now only applied when the original record is actually variant.

AlleleFrequencyComparison - local changes added to make sure AC/AF attribute parsing works properly (AC was previously read from the AF key).

Added HammingDistance annotation. Mostly useless. But only mostly.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4622 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-11-03 19:23:03 +00:00
parent 3d27defe93
commit 42e9987e69
7 changed files with 414 additions and 67 deletions

View File

@ -261,13 +261,16 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
qualityScoreHistograms = new QualityScoreHistograms();
}
if ( alleleCountStats == null && eval != null && validation != null ) {
alleleCountStats = new ACStats(eval,validation,Genotype.Type.values().length);
alleleCountSummary = new ACSummaryStats(eval, validation);
}
if (sampleStats == null) {
if (eval != null) {
// initialize the concordance table
sampleStats = new SampleStats(eval,Genotype.Type.values().length);
alleleCountStats = new ACStats(eval,Genotype.Type.values().length, new CompACNames(veWalker.getLogger()));
sampleSummaryStats = new SampleSummaryStats(eval);
alleleCountSummary = new ACSummaryStats(eval, new CompACNames(veWalker.getLogger()));
for (final VariantContext vc : missedValidationData) {
determineStats(null, vc);
}
@ -319,10 +322,8 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
}
sampleStats.incrValue(sample, truth, called);
if ( evalAC != null ) {
if ( evalAC != null && validationAC != null) {
alleleCountStats.incrValue(evalAC,truth,called);
}
if ( validationAC != null ) {
alleleCountStats.incrValue(validationAC,truth,called);
}
}
@ -510,24 +511,21 @@ class SampleStats implements TableType {
* Sample stats, but for AC
*/
class ACStats extends SampleStats {
private final CompACNames myComp;
private String[] rowKeys;
public ACStats(VariantContext vc, int nGenotypeTypes, CompACNames comp) {
public ACStats(VariantContext evalvc, VariantContext compvc, int nGenotypeTypes) {
super(nGenotypeTypes);
rowKeys = new String[2+4*vc.getGenotypes().size()];
for ( int i = 0; i <= 2*vc.getGenotypes().size(); i++ ) { // todo -- assuming ploidy 2 here...
rowKeys = new String[1+2*evalvc.getGenotypes().size()+1+2*compvc.getGenotypes().size()];
for ( int i = 0; i <= 2*evalvc.getGenotypes().size(); i++ ) { // todo -- assuming ploidy 2 here...
concordanceStats.put(String.format("evalAC%d",i),new long[nGenotypeTypes][nGenotypeTypes]);
rowKeys[i] = String.format("evalAC%d",i);
}
for ( int i = 0; i <= 2*vc.getGenotypes().size(); i++ ) {
for ( int i = 0; i <= 2*compvc.getGenotypes().size(); i++ ) {
concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]);
rowKeys[1+2*vc.getGenotypes().size()+i] = String.format("compAC%d",i);
rowKeys[1+2*evalvc.getGenotypes().size()+i] = String.format("compAC%d",i);
}
myComp = comp;
}
public String getName() {
@ -717,23 +715,21 @@ class SampleSummaryStats implements TableType {
* SampleSummaryStats .. but for allele counts
*/
class ACSummaryStats extends SampleSummaryStats {
final private CompACNames myComp;
private String[] rowKeys;
public ACSummaryStats (final VariantContext vc, CompACNames comp) {
public ACSummaryStats (final VariantContext evalvc, final VariantContext compvc) {
concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]);
rowKeys = new String[3+4*vc.getGenotypes().size()];
rowKeys = new String[3+2*evalvc.getGenotypes().size() + 2*compvc.getGenotypes().size()];
rowKeys[0] = ALL_SAMPLES_KEY;
for( int i = 0; i <= 2*vc.getGenotypes().size() ; i ++ ) {
for( int i = 0; i <= 2*evalvc.getGenotypes().size() ; i ++ ) {
concordanceSummary.put(String.format("evalAC%d",i), new double[COLUMN_KEYS.length]);
rowKeys[i+1] = String.format("evalAC%d",i);
}
for( int i = 0; i <= 2*vc.getGenotypes().size() ; i ++ ) {
for( int i = 0; i <= 2*compvc.getGenotypes().size() ; i ++ ) {
concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]);
rowKeys[2+2*vc.getGenotypes().size()+i] = String.format("compAC%d",i);
rowKeys[2+2*evalvc.getGenotypes().size()+i] = String.format("compAC%d",i);
}
myComp = comp;
}
public String getName() {

View File

@ -99,7 +99,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
vc = VariantContextUtils.modifyLocation(vc, GenomeLocParser.createPotentiallyInvalidGenomeLoc(toInterval.getSequence(), toInterval.getStart(), toInterval.getStart() + length));
VariantContext newVC = VariantContext.createVariantContextWithPaddedAlleles(vc, ref.getBase(), false);
if ( VariantContextUtils.getSNPSubstitutionType(originalVC) != VariantContextUtils.getSNPSubstitutionType(newVC) ) {
if ( originalVC.isVariant() && VariantContextUtils.getSNPSubstitutionType(originalVC) != VariantContextUtils.getSNPSubstitutionType(newVC) ) {
logger.warn(String.format("VCF at %s / %d => %s / %d is switching substitution type %s/%s to %s/%s",
originalVC.getChr(), originalVC.getStart(), newVC.getChr(), newVC.getStart(),
originalVC.getReference(), originalVC.getAlternateAllele(0), newVC.getReference(), newVC.getAlternateAllele(0)));

View File

@ -0,0 +1,111 @@
package org.broadinstitute.sting.oneoffprojects.walkers.annotator;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFHeaderLineType;
import org.broad.tribble.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Oct 20, 2010
* Time: 3:08:06 PM
* To change this template use File | Settings | File Templates.
*/
public class HammingDistance implements ExperimentalAnnotation, InfoFieldAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
if ( tracker == null ) {
return null;
}
VariantContext hamCon = tracker.getVariantContext(ref,"hamming",null,ref.getLocus(),true);
if ( hamCon == null ) {
return null;
}
Set<String> interSamples = new HashSet<String>(vc.getSampleNames());
interSamples.retainAll(hamCon.getSampleNames());
int dist = 0;
int nrd_num = 0;
int hamCalls = 0;
int vcCallsAtHamCalls = 0;
int num_variant = 0;
for ( String s : interSamples ) {
dist += dist(vc.getGenotype(s),hamCon.getGenotype(s),true);
nrd_num += dist(vc.getGenotype(s),hamCon.getGenotype(s),false);
if ( vc.getGenotype(s).isHet() || vc.getGenotype(s).isHomVar() || hamCon.getGenotype(s).isHet() || hamCon.getGenotype(s).isHomVar() ) {
num_variant ++;
}
if ( hamCon.getGenotype(s).isCalled() ) {
hamCalls++;
if ( vc.getGenotype(s).isCalled() ) {
vcCallsAtHamCalls++;
}
}
}
HashMap<String,Object> map = new HashMap<String,Object>(1);
map.put("HMD",dist);
map.put("HCR",(0.0+vcCallsAtHamCalls)/(0.0+hamCalls));
map.put("NRD",(0.0+nrd_num)/(0.0+num_variant));
map.put("OGC",(0.0+nrd_num)/(0.0+interSamples.size()));
return map;
}
public int dist(Genotype a, Genotype b, boolean weightByAC) {
if ( a.isNoCall() || b.isNoCall() ) {
return 0;
}
if ( weightByAC ) {
if ( a.isHomRef() ) {
if ( b.isHomVar() ) {
return 2;
} else if ( b.isHet() ) {
return 1;
} else {
return 0;
}
} else if ( a.isHet() ) {
if ( b.isHom() ) {
return 1;
} else {
return 0;
}
} else {
if ( b.isHomRef() ) {
return 2;
} else if ( b.isHet() ) {
return 1;
} else {
return 0;
}
}
} else {
if ( ! a.equals(b) ) {
return 1;
} else {
return 0;
}
}
}
public List<String> getKeyNames() { return Arrays.asList("HMD","HCR","NRD","OGC"); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HMD",1, VCFHeaderLineType.Integer,"The hamming distance between record in Hamming ROD and this record"),
new VCFInfoHeaderLine("HCR",1,VCFHeaderLineType.Float,"The differential call rate between record in Hamming ROD and this record"),
new VCFInfoHeaderLine("NRD",1,VCFHeaderLineType.Float,"The Non-reference discrepancy between Hamming ROD and this record"),
new VCFInfoHeaderLine("OGC",1,VCFHeaderLineType.Float,"The Overall Genotype Concordance between Hamming ROD and this one")); }
}

View File

@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.report.tags.DataPoint;
import org.broadinstitute.sting.utils.report.utils.TableType;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -43,27 +44,38 @@ public class AlleleFrequencyComparison extends VariantEvaluator {
if ( ! (isValidVC(eval) && isValidVC(comp)) ) {
return null;
} else {
if ( missingField(eval) ) {
// todo -- this is a godawful hack. The "right way" isn't working, so do it the unsafe way for now. Note that
// todo -- this precludes getting the AC/AF values from the info field because some may not be there...
/*if ( missingField(eval) ) {
recalculateCounts(eval);
}
if ( missingField(comp) ) {
recalculateCounts(comp);
}
afTable.update(getAF(eval),getAF(comp));
acTable.update(getAC(eval),getAC(comp));
}*/
HashMap<String,Object> evalCounts = new HashMap<String,Object>(2);
HashMap<String,Object> compCounts = new HashMap<String,Object>(2);
VariantContextUtils.calculateChromosomeCounts(eval,evalCounts,false);
VariantContextUtils.calculateChromosomeCounts(comp,compCounts,false);
afTable.update(((List<Double>)evalCounts.get("AF")).get(0),((List<Double>)compCounts.get("AF")).get(0));
acTable.update(((List<Integer>)evalCounts.get("AC")).get(0),((List<Integer>)compCounts.get("AC")).get(0));
}
return null; // there is nothing interesting
}
private static boolean missingField(final VariantContext vc) {
return ! ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY));
return ! ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) );
}
private static void recalculateCounts(VariantContext vc) {
Map<String,Object> attributes = vc.getAttributes();
private void recalculateCounts(VariantContext vc) {
Map<String,Object> attributes = new HashMap<String,Object>();
VariantContextUtils.calculateChromosomeCounts(vc,attributes,false);
VariantContext.modifyAttributes(vc,attributes);
vc = VariantContext.modifyAttributes(vc,attributes);
getVEWalker().getLogger().debug(String.format("%s %s | %s %s",attributes.get("AC"),attributes.get("AF"),vc.getAttribute("AC"),vc.getAttribute("AF")));
if ( attributes.size() == 2 && missingField(vc) ) {
throw new org.broadinstitute.sting.utils.exceptions.StingException("VariantContext should have had attributes modified but did not");
}
}
private static boolean isValidVC(final VariantContext vc) {
@ -72,7 +84,11 @@ public class AlleleFrequencyComparison extends VariantEvaluator {
private static double getAF(VariantContext vc) {
Object af = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY);
if ( List.class.isAssignableFrom(af.getClass())) {
if ( af == null ) {
//throw new UserException("Variant context "+vc.getName()+" does not have allele frequency entry which is required for this walker");
// still none after being re-computed; this is 0.00
return 0.00;
} else if ( List.class.isAssignableFrom(af.getClass())) {
return ( (List<Double>) af ).get(0);
} else if ( String.class.isAssignableFrom(af.getClass())) {
// two possibilities
@ -84,7 +100,7 @@ public class AlleleFrequencyComparison extends VariantEvaluator {
return Double.parseDouble(s);
}
} catch (NumberFormatException e) {
throw new UserException("Allele frequency field may be improperly formatted, found AF="+s);
throw new UserException("Allele frequency field may be improperly formatted, found AF="+s,e);
}
} else if ( Double.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY).getClass())) {
return (Double) af;
@ -94,8 +110,11 @@ public class AlleleFrequencyComparison extends VariantEvaluator {
}
private static int getAC(VariantContext vc) {
Object ac = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY);
if ( List.class.isAssignableFrom(ac.getClass())) {
Object ac = vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY);
if ( ac == null ) {
// still none after being re computed; this is 0
return 0;
} else if ( List.class.isAssignableFrom(ac.getClass())) {
return ( (List<Integer>) ac ).get(0);
} else if ( String.class.isAssignableFrom(ac.getClass())) {
// two possibilities
@ -107,9 +126,9 @@ public class AlleleFrequencyComparison extends VariantEvaluator {
return Integer.parseInt(s);
}
} catch (NumberFormatException e) {
throw new UserException("Allele count field may be improperly formatted, found AC="+s);
throw new UserException(String.format("Allele count field may be improperly formatted, found AC=%s for record %s:%d",ac,vc.getChr(),vc.getStart()),e);
}
} else if ( Double.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY).getClass())) {
} else if ( Integer.class.isAssignableFrom(ac.getClass())) {
return (Integer) ac;
} else {
throw new UserException(String.format("Class of Allele Frequency does not appear to be formated, had AF=%s, of class %s",ac.toString(),ac.getClass()));

View File

@ -3,10 +3,7 @@ package org.broadinstitute.sting.oneoffprojects.walkers.vcftools;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFConstants;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.vcf.VCFWriter;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
@ -21,6 +18,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.*;
import java.util.*;
@ -46,7 +44,8 @@ public class FixRefBases extends RodWalker<Integer,Integer> {
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList("variant"));
Set<String> vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
headerLines.add(new VCFHeaderLine("source", "FixRefBases"));
headerLines.add(new VCFInfoHeaderLine("FRI",1,VCFHeaderLineType.String,"The Fix-Ref-Info (if present, either \"Flipped\" or \"Fixed\")"));
out.writeHeader(new VCFHeader(headerLines,vcfSamples));
}
@ -56,7 +55,13 @@ public class FixRefBases extends RodWalker<Integer,Integer> {
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker != null && tracker.hasROD("variant") ) {
VariantContext vc = tracker.getVariantContext(ref,"variant",null,context.getLocation(),true);
VariantContext vc = null;
try {
vc = tracker.getVariantContext(ref,"variant",null,context.getLocation(),true);
} catch ( ReviewedStingException e ) {
logger.warn("Multiple variants found, catching exception ",e);
return 0;
}
VariantContext newContext = null;
if ( vc.isSNP() && ref.getBase() != vc.getReference().getBases()[0] && vc.getReference().length() == 1 ) {
if ( basesAreFlipped(vc,ref) ) {
@ -66,13 +71,19 @@ public class FixRefBases extends RodWalker<Integer,Integer> {
HashSet<Allele> newAlleles = new HashSet<Allele>(vc.getAlternateAlleles());
Allele refAllele = Allele.create(ref.getBase(),true);
newAlleles.add(refAllele);
HashMap<String,Object> newAttributes = new HashMap<String,Object>(vc.getAttributes());
newAttributes.put("FRI",String.format("Fixed%s-%s",vc.getReference().toString(),refAllele));
newContext = new VariantContext("FixRefBasesVC", ref.getLocus().getContig(),
ref.getLocus().getStart(), ref.getLocus().getStop(), newAlleles, fixGenotypes(vc.getGenotypes(),refAllele),
vc.hasNegLog10PError() ? 10.0*vc.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE,
vc.isFiltered() ? null : vc.getFilters(), vc.getAttributes());
vc.isFiltered() ? vc.getFilters() : null, newAttributes);
}
if ( ! newContext.hasAttribute("FRI") ) {
throw new StingException("FRI for fixed base not propagated. vc="+vc.toString());
}
out.add(newContext,ref.getBase());
return 1;
} else {
out.add(vc,ref.getBase());
@ -100,9 +111,12 @@ public class FixRefBases extends RodWalker<Integer,Integer> {
}
private VariantContext flipBases(VariantContext vc, ReferenceContext ref) {
logger.info(String.format("Flipping bases at variant position %s:%d",vc.getChr(),vc.getStart()));
HashSet<Allele> newAlleles = new HashSet<Allele>(vc.getAlleles().size());
newAlleles.add(Allele.create(ref.getBase(),true));
newAlleles.add(Allele.create(vc.getReference().getBases()[0],false));
Map<String,Object> attribs = new HashMap<String,Object>(vc.getAttributes());
attribs.put("FRI",String.format("Flipped%s-%s",vc.getReference().toString(),Allele.create(ref.getBase(),true).toString()));
for ( Allele a : vc.getAlternateAlleles() ) {
if ( a.getBases()[0] != ref.getBase() ) {
newAlleles.add(a);
@ -112,9 +126,8 @@ public class FixRefBases extends RodWalker<Integer,Integer> {
VariantContext newVC = new VariantContext("FixRefBasesVC", ref.getLocus().getContig(),
ref.getLocus().getStart(), ref.getLocus().getStop(), newAlleles, flipGenotypes(vc.getGenotypes(),newAlleles),
vc.hasNegLog10PError() ? 10.0*vc.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE,
vc.isFiltered() ? null : vc.getFilters(), vc.getAttributes());
vc.isFiltered() ? vc.getFilters() : null, attribs);
Map<String,Object> attribs = new HashMap<String,Object>(newVC.getAttributes());
VariantContextUtils.calculateChromosomeCounts(newVC,attribs,false);
VariantContext.modifyAttributes(newVC,attribs);
return newVC;

View File

@ -29,7 +29,7 @@ public class
String extraArgs = "-L 1:1-10,000,000";
for (String tests : testsEnumerations) {
WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -o %s",
1, Arrays.asList("158ac8e6d32eb2ea1bbeebfa512965de"));
1, Arrays.asList("8a0203f0533b628ad7f1f230a43f105f"));
executeTestParallel("testSelect1", spec);
//executeTest("testSelect1", spec);
}
@ -39,7 +39,7 @@ public class
public void testSelect2() {
String extraArgs = "-L 1:1-10,000,000";
WalkerTestSpec spec = new WalkerTestSpec( withSelect(withSelect(root, "DP < 50", "DP50"), "set==\"Intersection\"", "intersection") + " " + extraArgs + " -o %s",
1, Arrays.asList("cee96f61ffa1d042fe0c63550c508ec9"));
1, Arrays.asList("a55653980b750fdc8396eecb00e3b18c"));
executeTestParallel("testSelect2", spec);
//executeTest("testSelect2", spec);
}
@ -60,8 +60,8 @@ public class
@Test
public void testVESimple() {
HashMap<String, String> expectations = new HashMap<String, String>();
expectations.put("-L 1:1-10,000,000", "ff8c4ba16c7c14b4edbaf440f20641f9");
expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0 -E MendelianViolationEvaluator", "9b509ce5d31658eb09bb9597799b2908");
expectations.put("-L 1:1-10,000,000", "6c9a12fdd62672b69b2ed4f0bc7e8f97");
expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0 -E MendelianViolationEvaluator", "6a52a6454121974b71d9cfe2dad68c28");
for ( Map.Entry<String, String> entry : expectations.entrySet() ) {
String extraArgs = entry.getKey();
@ -84,10 +84,10 @@ public class
" -B:comp_hapmap,VCF " + validationDataLocation + "CEU_hapmap_nogt_23.vcf";
String matchingMD5 = "1e6d6e152c9a90513dd5b6eaa3729388";
String matchingMD5 = "7a8183402fe29f9c62c3c3cc4d45b46e";
expectations.put("", matchingMD5);
expectations.put(" -known comp_hapmap -known dbsnp", matchingMD5);
expectations.put(" -known comp_hapmap", "d28dd504017f39a91cde8e6f096879d6");
expectations.put(" -known comp_hapmap", "519b590635b7de035f8d5971115b47ba");
for (String tests : testsEnumerations) {
for (Map.Entry<String, String> entry : expectations.entrySet()) {
String extraArgs2 = entry.getKey();
@ -124,7 +124,7 @@ public class
for (String tests : testsEnumerations) {
WalkerTestSpec spec = new WalkerTestSpec(tests + " " + extraArgs + " -o %s -outputVCF %s -NO_HEADER",
2,
Arrays.asList("6b97a019402b3984fead9a4e8b7c7c2a", "989bc30dea6c8a4cf771cd1b9fdab488"));
Arrays.asList("50321436a65ef7d574286cb0a1c55f7e", "989bc30dea6c8a4cf771cd1b9fdab488"));
executeTestParallel("testVEWriteVCF", spec);
//executeTest("testVEWriteVCF", spec);
}

View File

@ -24,17 +24,21 @@ class omni_qc extends QScript {
var august_calls_EUR = new TaggedFile("/humgen/1kg/processing/release/august/EUR.vcf","vcf")
var august_calls_ASN = new TaggedFile("/humgen/1kg/processing/release/august/ASN.vcf","vcf")
var august_calls_AFR = new TaggedFile("/humgen/1kg/processing/release/august/AFR.vcf","vcf")
var august_calls_EUR_refined = new TaggedFile("/humgen/1kg/processing/release/august/EUR.beagle.vcf.gz","vcf")
var august_calls_ASN_refined = new TaggedFile("/humgen/1kg/processing/release/august/ASN.beagle.vcf.gz","vcf")
var august_calls_AFR_refined = new TaggedFile("/humgen/1kg/processing/release/august/AFR.beagle.vcf.gz","vcf")
var august_calls_EUR_refined = new TaggedFile("/humgen/1kg/processing/release/august/bgzip_for_release/EUR.beagle.vcf.gz","vcf")
var august_calls_ASN_refined = new TaggedFile("/humgen/1kg/processing/release/august/bgzip_for_release/ASN.beagle.vcf.gz","vcf")
var august_calls_AFR_refined = new TaggedFile("/humgen/1kg/processing/release/august/bgzip_for_release/AFR.beagle.vcf.gz","vcf")
var hiseq_calls_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878.HiSeq.v9.b36.vcf.gz","vcf")
var pilot1_with_na12878_vcf = new TaggedFile("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/calls/v2/N60/lowpass.N60.recal.mG6.retranche.vcf","vcf")
var pilot1_na12878_beagle = new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/calls/beagle/lowpass.N60.recal.CEUTSI.bgl.output.vcf")
//var august_calls_other_genotypes = _
// OMNI VCF FILES
var OMNI_b36_vcf = new TaggedFile("/humgen/illumina/1kg_seq_vcfs/Illumina_HapMap_Omni_2.5_764samples.vcf","vcf")
var OMNI_b37_vcf = new TaggedFile("/broad/shptmp/chartl/Omni_2.5_764_samples.b37.deduped.vcf","vcf")
var OMNI_hapmap_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_pilot.b36.vcf","vcf")
var OMNI_b36_panel_vcf = new TaggedFile("/broad/shptmp/chartl/omni/vcfs/Omni_b36_with_panel_sets.vcf","vcf")
var OMNI_b37_birdseed = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/OMNI_birdseed_only.vcf")
var OMNI_b37_joint = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/OMNI_joint_birdseed_lowpass.vcf")
// INTERVALS
var pilot3_interval_list: String = "/humgen/gsa-hpprojects/1kg/1kg_pilot3/documents/CenterSpecificTargetLists/results/p3overlap.targets.b36.interval_list"
@ -52,6 +56,14 @@ class omni_qc extends QScript {
val scratch_dir = analysis_dir + "scratch/"
val eval_dir = analysis_dir + "eval/"
val vcf_dir = analysis_dir + "vcfs/"
val p1_ceu_only = scratch_dir+"Pilot1_CEU_only_sites.intervals.list"
val p1_chbjpt_only = scratch_dir+"Pilot1_CHBJPT_only_sites.intervals.list"
val p1_yri_only = scratch_dir+"Pilot1_YRI_only_sites.intervals.list"
// OTHER CHIPS
val OMNI_QUAD_1KG = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/other_chips/1KG_OMNI.ref_fixed.vcf")
val AFFY_6_0 = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/other_chips/1KG_ARRAY.ref_fixed.vcf")
trait OmniArgs extends CommandLineGATK {
this.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
@ -128,38 +140,42 @@ class omni_qc extends QScript {
fix_421.variantVCF = OMNI_hapmap_b36_vcf
fix_421.reference_sequence = b36_ref
fix_421.out = new File(vcf_dir+swapExt(OMNI_hapmap_b36_vcf.getName,".vcf",".ref_fixed.vcf"))
fix_421.bypassException = true
fix_764.variantVCF = OMNI_b36_vcf
fix_764.reference_sequence = b36_ref
fix_764.out = new File(vcf_dir+swapExt(OMNI_b36_vcf.getName,".vcf",".ref_fixed.vcf"))
fix_764.bypassException = true
fix_764_b37.variantVCF = OMNI_b37_vcf
fix_764_b37.reference_sequence = b37_ref
fix_764_b37.out = new File(vcf_dir+swapExt(OMNI_b37_vcf.getName,".vcf",".ref_fixed.vcf"))
fix_764_b37.bypassException = true
add(fix_421,fix_764,fix_764_b37)
/** Propagate AC/AN annotations to Omni files via variant annotator **/
/** Propagate AC/AN annotations to Omni files via variant annotator **/
var annotate_421 = new VariantAnnotator with OmniArgs
var annotate_764 = new VariantAnnotator with OmniArgs
var annotate_764_b37 = new VariantAnnotator with OmniArgs
annotate_421.variantVCF = fix_421.out
annotate_421.variantVCF = OMNI_hapmap_b36_vcf
annotate_421.reference_sequence = b36_ref
annotate_421.annotation :+= "ChromosomeCounts"
annotate_421.out = new File(vcf_dir+swapExt(fix_421.out.getName,".vcf",".annot.vcf"))
annotate_764.variantVCF = fix_764.out
annotate_421.out = new File(vcf_dir+swapExt(annotate_421.variantVCF.getName,".vcf",".annot.vcf"))
annotate_764.variantVCF = OMNI_b36_vcf
annotate_764.reference_sequence = b36_ref
annotate_764.annotation :+= "ChromosomeCounts"
annotate_764.out = new File(vcf_dir+swapExt(fix_764.out.getName,".vcf",".annot.vcf"))
annotate_764_b37.variantVCF = fix_764_b37.out
annotate_764.out = new File(vcf_dir+swapExt(annotate_764.variantVCF.getName,".vcf",".annot.vcf"))
annotate_764_b37.variantVCF = OMNI_b37_vcf
annotate_764_b37.reference_sequence = b37_ref
annotate_764_b37.annotation :+= "ChromosomeCounts"
annotate_764_b37.out = new File(vcf_dir+swapExt(fix_764_b37.out.getName,".vcf",".annot.vcf"))
annotate_764_b37.out = new File(vcf_dir+swapExt(annotate_764_b37.variantVCF.getName,".vcf",".annot.vcf"))
add(annotate_421,annotate_764,annotate_764_b37)
/** Eval the omni chip against the various comps **/
runEval(annotate_764.out,gunzip_p1_ceu.outFile,"OMNI_764","Pilot1_CEU",pilot1_interval_list, b36_ref)
runEval(annotate_421.out,gunzip_p1_ceu.outFile,"OMNI_421","Pilot1_CEU",pilot1_interval_list, b36_ref)
runEval(annotate_421.out,gunzip_p1_ceu.outFile,"OMNI_421","Pilot1_CEU",pilot1_interval_list, b36_ref,true)
//runEval(OMNI_hapmap_b36_vcf,gunzip_p1_ceu.outFile,"OMNI_421_Unfixed","Pilot1_CEU",pilot1_interval_list,b36_ref)
runEval(annotate_764.out,gunzip_p1_chb.outFile,"OMNI_764","Pilot1_CHB",pilot1_interval_list, b36_ref)
runEval(annotate_421.out,gunzip_p1_chb.outFile,"OMNI_421","Pilot1_CHB",pilot1_interval_list, b36_ref)
runEval(annotate_764.out,gunzip_p1_yri.outFile,"OMNI_764","Pilot1_YRI",pilot1_interval_list, b36_ref)
@ -170,7 +186,13 @@ class omni_qc extends QScript {
runEval(annotate_764_b37.out,gunzip_ag_asn.outFile,"OMNI_764","August_ASN",production_interval_list, b37_ref)
runEval(annotate_764_b37.out,gunzip_ag_afr.outFile,"OMNI_764","Ausust_AFR",production_interval_list, b37_ref)
runEval(annotate_764.out,gunzip_hiseq.outFile,"OMNI_764","HiSeq",hiseq_interval_list, b36_ref)
runEval(annotate_764.out,OMNI_hapmap_b36_vcf,"OMNI_764","OMNI_421",pilot1_interval_list,b36_ref)
runEval(annotate_764.out,annotate_421.out,"OMNI_764","OMNI_421_FIXED",pilot1_interval_list,b36_ref)
runEval(annotate_764.out,OMNI_QUAD_1KG,"OMNI_764","OMNI_QUAD",pilot1_interval_list,b36_ref)
runEval(annotate_764.out,AFFY_6_0,"OMNI_764","AFFY_6_0",pilot1_interval_list,b36_ref)
runEval(OMNI_b37_birdseed,gunzip_ag_eur.outFile,"OMNI_birdseed","August_EUR",production_interval_list,b37_ref)
runEval(OMNI_b37_joint,gunzip_ag_eur.outFile,"OMNI_joint","August_EUR",production_interval_list,b37_ref)
runEval(OMNI_QUAD_1KG,gunzip_p1_ceu.outFile,"OMNI_QUAD_1KG","Pilot1_CEU",pilot1_interval_list,b36_ref)
runEval(AFFY_6_0,gunzip_p1_ceu.outFile,"AFFY_6_0","Pilot1_CEU",pilot1_interval_list,b36_ref)
var eval1KG_exclude = new VariantEval with OmniArgs
eval1KG_exclude.samples :+= "/broad/shptmp/chartl/omni/scratch/OMNI_764_vs_Pilot3.sample_overlap.exclude.mixups.txt"
@ -187,6 +209,31 @@ class omni_qc extends QScript {
runAFComparison(annotate_764.out, gunzip_p1_ceu.outFile, gunzip_p1_chb.outFile, gunzip_p1_yri.outFile)
var subset421: SelectVariants = new SelectVariants with OmniArgs
subset421.reference_sequence = b36_ref
subset421.sample :+= (new File(scratch_dir+"OMNI_421_vs_Pilot1_CEU.sample_overlap.txt")).getAbsolutePath
subset421.variantVCF = annotate_764.out
subset421.out = new File(vcf_dir+swapExt(annotate_764.out.getName,".vcf",".subset.pilot1CEU.vcf"))
add(subset421)// lastly to find things in the three-way pilot venn
var combine: CombineVariants = new CombineVariants with OmniArgs
combine.reference_sequence = b36_ref
combine.rodBind :+= new RodBind("CEU","VCF",gunzip_p1_ceu.outFile)
combine.rodBind :+= new RodBind("ASN","VCF",gunzip_p1_chb.outFile)
combine.rodBind :+= new RodBind("YRI","VCF",gunzip_p1_yri.outFile)
combine.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.UNIQUIFY)
combine.priority = "%s,%s,%s".format("CEU","ASN","YRI")
combine.out = new File(vcf_dir+"Pilot1_Populations_Combined.vcf")
add(combine)
selectSites(OMNI_b36_panel_vcf,p1_ceu_only,"ceu_only_sites")
selectSites(OMNI_b36_panel_vcf,p1_chbjpt_only,"chbjpt_only_sites")
selectSites(OMNI_b36_panel_vcf,p1_yri_only,"yri_only_sites")
runBeagleAnalysis(new File(vcf_dir + "Illumina_HapMap_Omni_2.5_764samples.annot.stripped.vcf"))
}
def processAuxiliaryChipData(otherChips: File) : List[(String,File)] = {
@ -194,7 +241,7 @@ class omni_qc extends QScript {
return Nil
}
def runEval(eval: File, comp: File, eBase: String, cBase: String, intervals: String, reference: File) = {
def runEval(eval: File, comp: File, eBase: String, cBase: String, intervals: String, reference: File, interesting: Boolean = false) = {
var base = "%s_vs_%s".format(eBase,cBase)
var getOverlap = new GetSampleOverlap
getOverlap.in_vcfs :+= eval
@ -214,6 +261,11 @@ class omni_qc extends QScript {
vEval.out = new File(eval_dir+base+".eval.csv")
if ( interesting ) {
vEval.discordantInteresting = true
vEval.outputVCF = new File(vcf_dir+"%s_vs_%s.interesting_sites.vcf".format(eBase,cBase))
}
add(vEval)
}
@ -245,7 +297,7 @@ class omni_qc extends QScript {
}
// step three -- compare the pilot 1 files against all populations and panels
runComps("Pilot1CEU",p1ceu,"CEU",nameToSubset("CEU").out)
runComps("Pilot1CEU",p1ceu,"EUR",nameToSubset("EUR").out)
runComps("Pilot1CHBJPT",p1asn,"CHBJPT",nameToSubset("CHBJPT").out)
@ -257,6 +309,20 @@ class omni_qc extends QScript {
runComps("EUR",nameToSubset("EUR").out,"ASW",nameToSubset("ASW").out)
runComps("EUR",nameToSubset("EUR").out,"AMR",nameToSubset("ADM").out)
var panelCombine: CombineVariants = new CombineVariants with OmniArgs
panelCombine.reference_sequence = b36_ref
panelCombine.priority = ""
for ( p <- panels ) {
panelCombine.rodBind :+= new RodBind(p,"VCF",nameToSubset(p).out)
panelCombine.priority = if ( panelCombine.priority.equals("") ) p else panelCombine.priority + "," + p
}
panelCombine.out = OMNI_b36_panel_vcf
panelCombine.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)
panelCombine.variantMergeOptions = Some(VariantContextUtils.VariantMergeType.UNION)
panelCombine.setKey = "panel"
add(panelCombine)
return true
}
@ -282,6 +348,8 @@ class omni_qc extends QScript {
eval.rodBind :+= new RodBind("comp%s".format(cBase),"VCF",compVCF)
eval.noStandard = true
eval.E :+= "AlleleFrequencyComparison"
eval.reportType = Some(VE2TemplateType.CSV)
eval.out = new File(eval_dir+"%s_vs_%s_allele_frequency.eval".format(eBase,cBase))
add(eval)
@ -293,7 +361,147 @@ class omni_qc extends QScript {
combine.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.UNIQUIFY)
combine.priority = "%s,%s".format(eBase,cBase)
add(combine)
//add(combine)
}
// Restrict a VCF to the given interval string; the output lands next to the
// input, named by appending `base` before the ".vcf" extension.
def selectSites(vcf: File, intervals: String, base: String) {
  val selector = new SelectVariants with OmniArgs
  selector.reference_sequence = b36_ref
  selector.intervalsString :+= intervals
  selector.variantVCF = vcf
  selector.out = swapExt(vcf,".vcf",base+".vcf")
  add(selector)
}
// Beagle-refine the pilot-1 + NA12878 calls chromosome by chromosome against
// the supplied OMNI VCF, merge the per-chromosome results, pull out NA12878,
// and evaluate genotype concordance against the HiSeq calls three ways:
// refined-with-chip, refined-without-chip, and completely unrefined.
def runBeagleAnalysis(omnivcf: File) {
  // Merge the 22 autosomal per-chromosome refinement outputs into one VCF.
  val chromMerge: CombineVariants = new CombineVariants with OmniArgs
  chromMerge.reference_sequence = b36_ref
  val autosomes = 1 until 23
  for ( c <- autosomes ) {
    chromMerge.rodBind :+= new RodBind("beagle%s".format(c),"VCF",runBeagle(omnivcf,"%s".format(c)))
  }
  // "beagle1,beagle2,...,beagle22" — same priority string the incremental
  // concatenation would build.
  chromMerge.priority = autosomes.map(c => "%s%s".format("beagle",c)).mkString(",")
  chromMerge.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.PRIORITIZE)
  chromMerge.variantMergeOptions = Some(VariantContextUtils.VariantMergeType.UNION)
  chromMerge.out = swapExt(pilot1_with_na12878_vcf,".vcf",".beagle_refined_with_omni.vcf")
  add(chromMerge)

  // Extract the NA12878 sample from the merged, refined callset.
  val na12878Select: SelectVariants = new SelectVariants with OmniArgs
  na12878Select.reference_sequence = b36_ref
  na12878Select.variantVCF = chromMerge.out
  na12878Select.sample :+= "NA12878"
  na12878Select.out = new File(vcf_dir + "NA12878.lowpass.beagle.refined.with.pilot1.vcf")
  add(na12878Select)

  // Concordance of chip-refined NA12878 lowpass calls vs. HiSeq.
  val refinedEval: VariantEval = new VariantEval with OmniArgs
  refinedEval.reference_sequence = b36_ref
  refinedEval.rodBind :+= new RodBind("evalNA12878LowPass","VCF",na12878Select.out)
  refinedEval.rodBind :+= new RodBind("compNA12878HiSeq","VCF",hiseq_calls_vcf)
  refinedEval.E :+= "GenotypeConcordance"
  refinedEval.out = new File(eval_dir+"NA12878.lowpass.beagle.vs.HiSeq.eval")
  refinedEval.excludeIntervals :+= new File(pilot1_interval_list)
  refinedEval.reportType = Some(VE2TemplateType.CSV)
  add(refinedEval)

  // Concordance of the chip-free Beagle refinement vs. HiSeq.
  val noChipEval: VariantEval = new VariantEval with OmniArgs
  noChipEval.reference_sequence = b36_ref
  noChipEval.rodBind :+= new RodBind("evalNA12878Beagle","VCF",pilot1_na12878_beagle)
  noChipEval.rodBind :+= new RodBind("compNA12878HiSeq","VCF",hiseq_calls_vcf)
  noChipEval.E :+= "GenotypeConcordance"
  noChipEval.sample :+= "NA12878"
  noChipEval.out = new File(eval_dir+"NA12878.lowpass.nochip.vs.Hiseq.eval")
  noChipEval.excludeIntervals :+= new File(pilot1_interval_list)
  noChipEval.reportType = Some(VE2TemplateType.CSV)
  add(noChipEval)

  // Baseline: unrefined calls vs. HiSeq.
  val unrefinedEval: VariantEval = new VariantEval with OmniArgs
  unrefinedEval.reference_sequence = b36_ref
  unrefinedEval.rodBind :+= new RodBind("evalNA12878NoBeagle","VCF",pilot1_with_na12878_vcf)
  unrefinedEval.rodBind :+= new RodBind("compNA12878HiSeq","VCF",hiseq_calls_vcf)
  unrefinedEval.E :+= "GenotypeConcordance"
  unrefinedEval.sample :+= "NA12878"
  unrefinedEval.out = new File(eval_dir+"NA12878.lowpass.nochip.norefined.vs.Hiseq.eval")
  unrefinedEval.excludeIntervals :+= new File(pilot1_interval_list)
  unrefinedEval.reportType = Some(VE2TemplateType.CSV)
  add(unrefinedEval)
}
// Build the per-chromosome Beagle refinement pipeline for one chromosome:
// produce Beagle input (using the OMNI VCF as high-confidence validation
// genotypes), run Beagle, gunzip its phased/likelihood outputs, and convert
// everything back to a VCF. Returns the converted VCF so the caller can
// merge the per-chromosome results.
//
// NOTE: statement order matters here — runBeagle.freezeOutputs must run
// before the gunzip steps read beaglePhasedFile/beagleLikelihoods, since
// those fields are only populated by freezeOutputs.
def runBeagle(omnivcf: File, chr: String): File = {
var beagleInput = new ProduceBeagleInput with OmniArgs
beagleInput.reference_sequence = b36_ref
beagleInput.intervalsString :+= chr
beagleInput.variantVCF = pilot1_with_na12878_vcf
// OMNI genotypes injected as validation data with prior 0.99.
beagleInput.rodBind :+= new RodBind("validation","VCF",omnivcf)
beagleInput.validation_genotype_ptrue = Some(0.99)
beagleInput.out = new File(scratch_dir+"/"+swapExt(beagleInput.variantVCF.getName,".vcf",".%s.beagle".format(chr)))
println (beagleInput.out.getAbsolutePath)
var runBeagle : BeagleRefinement = new BeagleRefinement
runBeagle.beagleInput = beagleInput.out
runBeagle.beagleOutputBase = "Pilot1_NA12878_Beagle_with_OMNI_chr%s".format(chr)
runBeagle.beagleMemoryGigs = 6
runBeagle.memoryLimit = Some(6)
runBeagle.beagleOutputDir = ""
// Populates beaglePhasedFile/beagleLikelihoods/beagleRSquared below.
runBeagle.freezeOutputs
var gunzipPhased = new GunzipFile
gunzipPhased.gunzipMe = runBeagle.beaglePhasedFile
// NOTE(review): no "/" between scratch_dir and the file name here (and for
// gunzipLike below), unlike beagleInput.out above — presumably scratch_dir
// ends with a separator; verify, otherwise these land in the wrong place.
gunzipPhased.outFile = new File(scratch_dir+swapExt(runBeagle.beaglePhasedFile.getName,".gz",""))
var gunzipLike = new GunzipFile
gunzipLike.gunzipMe = runBeagle.beagleLikelihoods
gunzipLike.outFile = new File(scratch_dir+swapExt(runBeagle.beagleLikelihoods.getName,".gz",""))
// Convert the Beagle text outputs back into a VCF against the original calls.
var convertBack : BeagleOutputToVCF = new BeagleOutputToVCF with OmniArgs
convertBack.reference_sequence = b36_ref
convertBack.variantVCF = pilot1_with_na12878_vcf
convertBack.intervalsString :+= chr
convertBack.rodBind :+= new RodBind("beagleR2","beagle",runBeagle.beagleRSquared)
convertBack.rodBind :+= new RodBind("beagleProbs","beagle",gunzipLike.outFile)
convertBack.rodBind :+= new RodBind("beaglePhased","beagle",gunzipPhased.outFile)
convertBack.out = new File(vcf_dir+swapExt(pilot1_with_na12878_vcf.getName,".vcf",".chr%s.beagle_refined_plus_omni.vcf".format(chr)))
add(beagleInput,runBeagle,gunzipPhased,gunzipLike,convertBack)
return convertBack.out
}
// Wraps a command-line invocation of the Beagle refinement jar.
//
// Usage contract: set beagleInput, beagleOutputBase (and optionally
// beagleOutputDir / beagleMemoryGigs), then call freezeOutputs BEFORE wiring
// the @Output files into downstream jobs — the output File fields are null
// until freezeOutputs computes them.
class BeagleRefinement extends CommandLineFunction {
  @Input(doc="The beagle input file") var beagleInput: File = _
  var beagleOutputBase: String = _
  var beagleMemoryGigs: Int = 4

  /**
   * Note: These get set by freezeOutputs, not by construction.
   */
  @Output(doc="The beagle phased file") var beaglePhasedFile: File = _
  @Output(doc="The beagle likelihood file") var beagleLikelihoods: File = _
  @Output(doc="The beagle r2 file") var beagleRSquared: File = _
  var beagleOutputDir: String = _

  // Resolve the output directory (default: the input's directory, or the
  // working directory when the input has no parent) and derive the three
  // Beagle output file names from it.
  def freezeOutputs = {
    if ( beagleOutputDir == null && beagleInput.getParent == null ) {
      beagleOutputDir = ""
    } else if ( beagleOutputDir == null ) {
      beagleOutputDir = beagleInput.getParent+"/"
    }
    beaglePhasedFile = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".phased.gz")
    beagleLikelihoods = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".gprobs.gz")
    beagleRSquared = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".r2")
  }

  def commandLine = {
    // Bug fix: when beagleInput has no parent directory, getParent is null and
    // the command would receive the literal string "null" as java.io.tmpdir;
    // fall back to the working directory instead.
    val tmpDir = if ( beagleInput.getParent == null ) "." else beagleInput.getParent
    "java -Djava.io.tmpdir=%s -Xmx%dg -jar /humgen/gsa-hpprojects/software/beagle/beagle.jar like=%s out=%s".format(tmpDir,beagleMemoryGigs,beagleInput.getAbsolutePath,beagleOutputBase)
  }
}
}