a) Bug fix in VCFHeader parsing - Info fields were not being parsed properly, with the result that the Count field was not being properly displayed in records (e.g. if Count=0 for a particular field, the INFO tag was still being displayed as ...;Field=x;... instead of ...;Field;...

b) Bug fixes and update to how we represent indels and other complex events in a VariantContext object. Convention is now that all events are left aligned, with the first variant context location marking the common base before an event occurs. However, alleles in a VC don't have the common base in all VC's. Two new functions are now part of VariantContextUtils: CreateVariantContextWithPaddedAlleles and CreateVariantContextWithTrimmedAlleles. Both take a VC as an input and create a VC as an output.
Main flow is that a VCF reader would create a VC with trimmed alleles, all walkers would ideally work with these trimmed alleles, and then the VCF writer would pad back the alleles before writing. However, there are special cases where we need to pad alleles like for example when merging/combining VC's.

Pending issues:
- PED and DBSNP RODs have to be updated to create VC's for indels following the convention above. Changes will go in after Tribble location is moved and things are tested.
- Need to verify Indel genotyper and other modules that create VC's with indels.- Wiki page describing convention above and how walkers should interpret indel VC's still needs updating/detailing.
 


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3850 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2010-07-22 02:36:45 +00:00
parent b696c3ea98
commit 473ec91633
15 changed files with 246 additions and 106 deletions

View File

@ -85,10 +85,14 @@ public class VCFHeader {
*/ */
private void loadMetaDataMaps() { private void loadMetaDataMaps() {
for ( VCFHeaderLine line : mMetaData ) { for ( VCFHeaderLine line : mMetaData ) {
if ( line instanceof VCFInfoHeaderLine ) if ( line instanceof VCFInfoHeaderLine ) {
mInfoMetaData.put(line.getKey(), (VCFInfoHeaderLine)line); VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;
else if ( line instanceof VCFFormatHeaderLine ) mInfoMetaData.put(infoLine.getName(), infoLine);
mFormatMetaData.put(line.getKey(), (VCFFormatHeaderLine)line); }
else if ( line instanceof VCFFormatHeaderLine ) {
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
mFormatMetaData.put(formatLine.getName(), formatLine);
}
} }
} }

View File

@ -898,7 +898,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
// if ( getType() == Type.INDEL ) { // if ( getType() == Type.INDEL ) {
// if ( getReference().length() != (getLocation().size()-1) ) { // if ( getReference().length() != (getLocation().size()-1) ) {
if ( (getReference().isNull() && getLocation().size() != 1 ) || if ( (getReference().isNull() && getLocation().size() != 1 ) ||
(getReference().isNonNull() && getReference().length() != getLocation().size()) ) { (getReference().isNonNull() && (getLocation().size() - getReference().length() > 1))) {
throw new IllegalStateException("BUG: GenomeLoc " + getLocation() + " has a size == " + getLocation().size() + " but the variation reference allele has length " + getReference().length() + " this = " + this); throw new IllegalStateException("BUG: GenomeLoc " + getLocation() + " has a size == " + getLocation().size() + " but the variation reference allele has length " + getReference().length() + " this = " + this);
} }
} }

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.contexts.variantcontext;
import java.io.Serializable; import java.io.Serializable;
import java.util.*; import java.util.*;
import org.apache.commons.jexl2.*; import org.apache.commons.jexl2.*;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
@ -170,8 +171,8 @@ public class VariantContextUtils {
UNION, INTERSECT UNION, INTERSECT
} }
public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs) { public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, byte[] refBases) {
return simpleMerge(unsortedVCs, null, VariantMergeType.INTERSECT, GenotypeMergeType.UNSORTED, false, false); return simpleMerge(unsortedVCs, null, VariantMergeType.INTERSECT, GenotypeMergeType.UNSORTED, false, false, refBases);
} }
@ -188,7 +189,7 @@ public class VariantContextUtils {
*/ */
public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs,
VariantMergeType variantMergeOptions, GenotypeMergeType genotypeMergeOptions, VariantMergeType variantMergeOptions, GenotypeMergeType genotypeMergeOptions,
boolean annotateOrigin, boolean printMessages ) { boolean annotateOrigin, boolean printMessages, byte[] inputRefBases ) {
if ( unsortedVCs == null || unsortedVCs.size() == 0 ) if ( unsortedVCs == null || unsortedVCs.size() == 0 )
return null; return null;
@ -198,7 +199,16 @@ public class VariantContextUtils {
if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE ) if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE )
verifyUniqueSampleNames(unsortedVCs); verifyUniqueSampleNames(unsortedVCs);
List<VariantContext> VCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions);
List<VariantContext> prepaddedVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions);
// Make sure all variant contexts are padded with reference base in case of indels if necessary
List<VariantContext> VCs = new ArrayList<VariantContext>();
for (VariantContext vc : prepaddedVCs) {
VCs.add(createVariantContextWithPaddedAlleles(vc,inputRefBases));
}
// establish the baseline info from the first VC // establish the baseline info from the first VC
VariantContext first = VCs.get(0); VariantContext first = VCs.get(0);
@ -378,6 +388,133 @@ public class VariantContextUtils {
} }
public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) {
// see if we need to trim common reference base from all alleles
boolean trimVC = true;
// We need to trim common reference base from all alleles if a ref base is common to all alleles
Allele refAllele = inputVC.getReference();
if (!inputVC.isVariant())
trimVC = false;
else if (refAllele.isNull())
trimVC = false;
else {
for (Allele a : inputVC.getAlternateAlleles()) {
if (a.length() < 1 || (a.getBases()[0] != refAllele.getBases()[0]))
trimVC = false;
}
}
// nothing to do if we don't need to trim bases
if (trimVC) {
List<Allele> alleles = new ArrayList<Allele>();
Map<String, Genotype> genotypes = new TreeMap<String, Genotype>();
Map<String, Genotype> inputGenotypes = inputVC.getGenotypes();
// set the reference base for indels in the attributes
Map<String,Object> attributes = new TreeMap<String,Object>();
for ( Map.Entry<String, Object> p : inputVC.getAttributes().entrySet() ) {
attributes.put(p.getKey(), p.getValue());
}
attributes.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, new Byte(inputVC.getReference().getBases()[0]));
for (Allele a : inputVC.getAlleles()) {
// get bases for current allele and create a new one with trimmed bases
byte[] newBases = Arrays.copyOfRange(a.getBases(),1,a.length());
alleles.add(Allele.create(newBases,a.isReference()));
}
// now we can recreate new genotypes with trimmed alleles
for (String sample : inputVC.getSampleNames()) {
Genotype g = inputGenotypes.get(sample);
List<Allele> inAlleles = g.getAlleles();
List<Allele> newGenotypeAlleles = new ArrayList<Allele>();
for (Allele a : inAlleles) {
byte[] newBases = Arrays.copyOfRange(a.getBases(),1,a.length());
newGenotypeAlleles.add(Allele.create(newBases, a.isReference()));
}
genotypes.put(sample, new Genotype(sample, newGenotypeAlleles, g.getNegLog10PError(),
g.getFilters(),g.getAttributes(),g.genotypesArePhased()));
}
return new VariantContext(inputVC.getName(), inputVC.getLocation(), alleles, genotypes, inputVC.getNegLog10PError(),
inputVC.getFilters(), attributes);
}
else
return inputVC;
}
public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, byte[] inputRefBase) {
Allele refAllele = inputVC.getReference();
// see if we need to pad common reference base from all alleles
boolean padVC;
// We need to pad a VC with a common base if the reference allele length is less than the vc location span.
long locLength = inputVC.getLocation().size();
if (refAllele.length() == locLength)
padVC = false;
else if (refAllele.length() == locLength-1)
padVC = true;
else throw new StingException("Badly formed variant context, reference length must be at most one base shorter than location size");
// nothing to do if we don't need to pad bases
if (padVC) {
Byte refByte;
Map<String,Object> attributes = inputVC.getAttributes();
if (BaseUtils.isRegularBase(inputRefBase[0]))
refByte = inputRefBase[0];
else if (attributes.containsKey(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY))
refByte = (Byte)attributes.get(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY);
else
throw new StingException("Error when trying to pad Variant Context: either input reference base must be a regular base, or input VC must contain reference base key");
List<Allele> alleles = new ArrayList<Allele>();
Map<String, Genotype> genotypes = new TreeMap<String, Genotype>();
Map<String, Genotype> inputGenotypes = inputVC.getGenotypes();
for (Allele a : inputVC.getAlleles()) {
// get bases for current allele and create a new one with trimmed bases
String newBases = new String(new byte[]{refByte}) + new String(a.getBases());
alleles.add(Allele.create(newBases,a.isReference()));
}
// now we can recreate new genotypes with trimmed alleles
for (String sample : inputVC.getSampleNames()) {
Genotype g = inputGenotypes.get(sample);
List<Allele> inAlleles = g.getAlleles();
List<Allele> newGenotypeAlleles = new ArrayList<Allele>();
for (Allele a : inAlleles) {
String newBases = new String(new byte[]{refByte}) + new String(a.getBases());
newGenotypeAlleles.add(Allele.create(newBases,a.isReference()));
}
genotypes.put(sample, new Genotype(sample, newGenotypeAlleles, g.getNegLog10PError(),
g.getFilters(),g.getAttributes(),g.genotypesArePhased()));
}
return new VariantContext(inputVC.getName(), inputVC.getLocation(), alleles, genotypes, inputVC.getNegLog10PError(),
inputVC.getFilters(), attributes);
}
else
return inputVC;
}
static class CompareByPriority implements Comparator<VariantContext>, Serializable { static class CompareByPriority implements Comparator<VariantContext>, Serializable {
List<String> priorityListOfVCs; List<String> priorityListOfVCs;
public CompareByPriority(List<String> priorityListOfVCs) { public CompareByPriority(List<String> priorityListOfVCs) {

View File

@ -9,6 +9,7 @@ import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
@ -393,46 +394,46 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
*/ */
private VariantContext parseVCFLine(String[] parts, boolean parseGenotypes) { private VariantContext parseVCFLine(String[] parts, boolean parseGenotypes) {
// try { // try {
// increment the line count // increment the line count
lineNo++; lineNo++;
// parse out the required fields
String contig = parts[0];
long pos = Long.valueOf(parts[1]);
String id = parts[2];
String ref = parts[3].toUpperCase();
String alts = parts[4].toUpperCase();
Double qual = parseQual(parts[5]);
String filter = parts[6];
String info = parts[7];
// get our alleles, filters, and setup an attribute map // parse out the required fields
List<Allele> alleles = parseAlleles(ref, alts); String contig = parts[0];
Set<String> filters = parseFilters(filter); long pos = Long.valueOf(parts[1]);
Map<String, Object> attributes = parseInfo(info, id); String id = parts[2];
String ref = parts[3].toUpperCase();
String alts = parts[4].toUpperCase();
Double qual = parseQual(parts[5]);
String filter = parts[6];
String info = parts[7];
// set the reference base for indels in the attributes // get our alleles, filters, and setup an attribute map
attributes.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, new Byte((byte)ref.charAt(0))); List<Allele> alleles = parseAlleles(ref, alts);
Set<String> filters = parseFilters(filter);
Map<String, Object> attributes = parseInfo(info, id);
// find out our current location, and clip the alleles down to their minimum length // find out our current location, and clip the alleles down to their minimum length
Pair<GenomeLoc, List<Allele>> locAndAlleles; Pair<GenomeLoc, List<Allele>> locAndAlleles;
if ( !isSingleNucleotideEvent(alleles) ) { if ( !isSingleNucleotideEvent(alleles) ) {
if (this.version != VCFHeaderVersion.VCF4_0) if (this.version != VCFHeaderVersion.VCF4_0)
throw new VCFParserException("Saw Indel/non SNP event on a VCF 3.3 or earlier file. Please convert file to VCF4.0 with VCFTools before using the GATK on it"); throw new VCFParserException("Saw Indel/non SNP event on a VCF 3.3 or earlier file. Please convert file to VCF4.0 with VCFTools before using the GATK on it");
locAndAlleles = clipAlleles(contig, pos, ref, alleles); locAndAlleles = clipAlleles(contig, pos, ref, alleles);
} else { } else {
locAndAlleles = new Pair<GenomeLoc, List<Allele>>(GenomeLocParser.createGenomeLoc(contig, pos), alleles); locAndAlleles = new Pair<GenomeLoc, List<Allele>>(GenomeLocParser.createGenomeLoc(contig, pos), alleles);
} }
// a map to store our genotypes // a map to store our genotypes
Map<String, Genotype> genotypes = null; Map<String, Genotype> genotypes = null;
// do we have genotyping data // do we have genotyping data
if (parts.length > 8 && parseGenotypes) { if (parts.length > 8 && parseGenotypes) {
genotypes = createGenotypeMap(parts, locAndAlleles, 8); genotypes = createGenotypeMap(parts, locAndAlleles, 8);
} }
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes); VariantContext vc = new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
// Trim bases of all alleles if necessary
return VariantContextUtils.createVariantContextWithTrimmedAlleles(vc);
} }
private boolean isSingleNucleotideEvent(List<Allele> alleles) { private boolean isSingleNucleotideEvent(List<Allele> alleles) {
@ -523,11 +524,11 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
// add it to the list // add it to the list
genotypes.put(sampleName, new Genotype(sampleName, genotypes.put(sampleName, new Genotype(sampleName,
parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], locAndAlleles.second, alleleMap), parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], locAndAlleles.second, alleleMap),
GTQual, GTQual,
genotypeFilters, genotypeFilters,
gtAttributes, gtAttributes,
phased)); phased));
} }
return genotypes; return genotypes;
@ -545,22 +546,16 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) { static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
List<Allele> newAlleleList = new ArrayList<Allele>(); List<Allele> newAlleleList = new ArrayList<Allele>();
/* //+ // Forward clipping (i.e. of first reference base) is not done here, but rather once a properly formed VC is obtained first.
boolean clipping = true; // System.out.format("%s:%d ",contig, position);
int forwardClipping = 0; //for (Allele a : unclippedAlleles) {
while(clipping) { // System.out.print(a.toString());
for (Allele a : unclippedAlleles) { //}
if (a.length()-forwardClipping == 0) // System.out.println();
clipping = false; //
else if (a.getBases()[forwardClipping] != ref.getBytes()[forwardClipping]) //
clipping = false;
if (clipping) forwardClipping++;
}
// find the preceeding string common to all alleles and the reference
}
//-
*/ // find the preceeding string common to all alleles and the reference
boolean clipping = true; boolean clipping = true;
for (Allele a : unclippedAlleles) for (Allele a : unclippedAlleles)
if (a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0])) { if (a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0])) {
@ -582,11 +577,12 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
for (Allele a : unclippedAlleles) for (Allele a : unclippedAlleles)
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipping,a.getBases().length-reverseClipped),a.isReference())); newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipping,a.getBases().length-reverseClipped),a.isReference()));
// the new reference length
int refLength = ref.length() - forwardClipping - reverseClipped;
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipping,(position+forwardClipping+Math.max(refLength - 1,0))), // the new reference length
newAlleleList); int refLength = ref.length() - reverseClipped;
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position,(position+Math.max(refLength - 1,0))),
newAlleleList);
} }
@ -598,7 +594,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
public Class getFeatureType() { public Class getFeatureType() {
return VariantContext.class; return VariantContext.class;
} }
/** /**
* get the name of this codec * get the name of this codec
* @return our set name * @return our set name
@ -619,7 +615,7 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec {
this.transformer = transformer; this.transformer = transformer;
} }
/** /**
* set the name of this codec * set the name of this codec
* @param name * @param name

View File

@ -148,7 +148,7 @@ public class BatchedCallsMerger extends LocusWalker<VariantContext, Integer> imp
} }
// merge the variant contexts // merge the variant contexts
return VariantContextUtils.simpleMerge(calls); return VariantContextUtils.simpleMerge(calls, ref.getBases());
} }
private AlignmentContext filterForSamples(ReadBackedPileup pileup, Set<String> samples) { private AlignmentContext filterForSamples(ReadBackedPileup pileup, Set<String> samples) {

View File

@ -101,7 +101,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
// get all of the vcf rods at this locus // get all of the vcf rods at this locus
Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, context.getLocation()); Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, context.getLocation());
VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, variantMergeOption, genotypeMergeOption, true, printComplexMerges); VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, variantMergeOption, genotypeMergeOption, true, printComplexMerges, ref.getBases());
if ( mergedVC != null ) // only operate at the start of events if ( mergedVC != null ) // only operate at the start of events
vcfWriter.add(mergedVC, ref.getBases()); vcfWriter.add(mergedVC, ref.getBases());

View File

@ -58,7 +58,7 @@ public class VCF4WriterTestWalker extends RodWalker<Integer, Integer> {
private String OUTPUT_FILE = null; private String OUTPUT_FILE = null;
public static final String INPUT_ROD_NAME = "vcf"; public static final String INPUT_ROD_NAME = "variant";
protected static String line = null; protected static String line = null;
final TreeSet<String> samples = new TreeSet<String>(); final TreeSet<String> samples = new TreeSet<String>();
@ -98,12 +98,12 @@ public class VCF4WriterTestWalker extends RodWalker<Integer, Integer> {
final Set<String> vcfSamples = header.getGenotypeSamples(); final Set<String> vcfSamples = header.getGenotypeSamples();
samples.addAll(vcfSamples); samples.addAll(vcfSamples);
vcfWriter.writeHeader(header);
} }
} }
vcfWriter.writeHeader(header);
} }
@ -119,7 +119,7 @@ public class VCF4WriterTestWalker extends RodWalker<Integer, Integer> {
return 0; return 0;
GenomeLoc loc = context.getLocation(); GenomeLoc loc = context.getLocation();
VariantContext vc = tracker.getVariantContext(ref,"vcf", null, loc, true); VariantContext vc = tracker.getVariantContext(ref,INPUT_ROD_NAME, null, loc, true);
if (vc == null) if (vc == null)

View File

@ -5,6 +5,7 @@ import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
@ -139,6 +140,9 @@ public class VCFWriter {
throw new IllegalArgumentException("The reference base must be provided to write VCF records"); throw new IllegalArgumentException("The reference base must be provided to write VCF records");
try { try {
vc = VariantContextUtils.createVariantContextWithPaddedAlleles(vc, refBases);
GenomeLoc loc = vc.getLocation(); GenomeLoc loc = vc.getLocation();
Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size()); Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size());
alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup
@ -148,8 +152,7 @@ public class VCFWriter {
mWriter.write(VCFConstants.FIELD_SEPARATOR); mWriter.write(VCFConstants.FIELD_SEPARATOR);
// POS // POS
// TODO -- Remove this off-by-one issue when positions are settled in the input mWriter.write(String.valueOf(loc.getStart()));
mWriter.write(String.valueOf(loc.getStart() - (vc.isIndel() ? 1 : 0)));
mWriter.write(VCFConstants.FIELD_SEPARATOR); mWriter.write(VCFConstants.FIELD_SEPARATOR);
// ID // ID
@ -159,7 +162,7 @@ public class VCFWriter {
// REF // REF
alleleMap.put(vc.getReference(), "0"); alleleMap.put(vc.getReference(), "0");
String refString = makeAlleleString(vc.getReference(), vc.isIndel(), refBases[0]); String refString = makeAlleleString(vc.getReference());
mWriter.write(refString); mWriter.write(refString);
mWriter.write(VCFConstants.FIELD_SEPARATOR); mWriter.write(VCFConstants.FIELD_SEPARATOR);
@ -167,13 +170,13 @@ public class VCFWriter {
if ( vc.isVariant() ) { if ( vc.isVariant() ) {
Allele altAllele = vc.getAlternateAllele(0); Allele altAllele = vc.getAlternateAllele(0);
alleleMap.put(altAllele, "1"); alleleMap.put(altAllele, "1");
String alt = makeAlleleString(altAllele, vc.isIndel(), refBases[0]); String alt = makeAlleleString(altAllele);
mWriter.write(alt); mWriter.write(alt);
for (int i = 1; i < vc.getAlternateAlleles().size(); i++) { for (int i = 1; i < vc.getAlternateAlleles().size(); i++) {
altAllele = vc.getAlternateAllele(i); altAllele = vc.getAlternateAllele(i);
alleleMap.put(altAllele, String.valueOf(i+1)); alleleMap.put(altAllele, String.valueOf(i+1));
alt = makeAlleleString(altAllele, vc.isIndel(), refBases[0]); alt = makeAlleleString(altAllele);
mWriter.write(","); mWriter.write(",");
mWriter.write(alt); mWriter.write(alt);
} }
@ -242,11 +245,10 @@ public class VCFWriter {
return s; return s;
} }
private String makeAlleleString(Allele allele, boolean isIndel, byte ref) { private String makeAlleleString(Allele allele) {
String s = new String(allele.getBases()); String s = new String(allele.getBases());
if ( isIndel || s.length() == 0 ) // in case the context is monomorphic at an indel site
s = (char)ref + s; return new String(allele.getBases());
return s;
} }
/** /**

View File

@ -18,11 +18,11 @@ public class VariantContextIntegrationTest extends WalkerTest {
static HashMap<String, String> expectations = new HashMap<String, String>(); static HashMap<String, String> expectations = new HashMap<String, String>();
static { static {
expectations.put("-L 1:1-10000 --printPerLocus", "cdd4cb53670a6c0f26e21bc220579654"); expectations.put("-L 1:1-10000 --printPerLocus", "63fd69e4ab430b79fb213dd27b58ae1c");
expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly", "7e1d9e181cc489aff847528664cf35bf"); expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly", "276ed96efaaffc2fc1c3b3deb4e04d1d");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsStartinAtCurrentPosition", "2226dc7ec9a21d8f0e86dc1b934b1d3e"); expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsStartinAtCurrentPosition", "a37f7bc34c1824688d3e475945c19d5a");
expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "272a9dff802d1d9f391ad53bc8c23da8"); expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "1715a6e0daf873f2e2cd10cb56085174");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType SNP", "7444609b931d10cfc1fdccca9b4a2ab5"); expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType SNP", "bf33ab1ed65da7f56c02ca7956d9c31e");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL", "629ffd6b3b9ea1bce29cb715576f5c8a"); expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL", "629ffd6b3b9ea1bce29cb715576f5c8a");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL --onlyContextsStartinAtCurrentPosition", "d4b812b2fec231f8f5b61d6f26cf86a5"); expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL --onlyContextsStartinAtCurrentPosition", "d4b812b2fec231f8f5b61d6f26cf86a5");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType MIXED", "546e8e546f2cdfba31f91ed083137c42"); expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType MIXED", "546e8e546f2cdfba31f91ed083137c42");

View File

@ -18,7 +18,7 @@ public class FastaAlternateReferenceIntegrationTest extends WalkerTest {
WalkerTestSpec spec2 = new WalkerTestSpec( WalkerTestSpec spec2 = new WalkerTestSpec(
"-T FastaAlternateReferenceMaker -R " + oneKGLocation + "reference/human_b36_both.fasta -B indels,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 -B snpmask,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -L 1:10,075,000-10,075,380;1:10,093,447-10,093,847;1:10,271,252-10,271,452 -o %s", "-T FastaAlternateReferenceMaker -R " + oneKGLocation + "reference/human_b36_both.fasta -B indels,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 -B snpmask,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -L 1:10,075,000-10,075,380;1:10,093,447-10,093,847;1:10,271,252-10,271,452 -o %s",
1, 1,
Arrays.asList("a7bfc673eaa202a668c1424a933671ad")); Arrays.asList("3a48986c3832a768b478c3e95f994b0f"));
executeTest("testFastaAlternateReferenceIndels", spec2); executeTest("testFastaAlternateReferenceIndels", spec2);
WalkerTestSpec spec4 = new WalkerTestSpec( WalkerTestSpec spec4 = new WalkerTestSpec(

View File

@ -29,7 +29,7 @@ public class IndelRealignerIntegrationTest extends WalkerTest {
@Test @Test
public void testRealignerKnownsOnly() { public void testRealignerKnownsOnly() {
String[] md5s = {"83bc0c9a7d8872b552c6cbd994672c3b", "92bf331b672a03d63c26e4d9d3615f5b"}; String[] md5s = {"7084d4e543bc756730ab306768028530", "1091436c40d5ba557d85662999cc0c1d"};
WalkerTestSpec spec = new WalkerTestSpec( WalkerTestSpec spec = new WalkerTestSpec(
"-T IndelRealigner -noPG -LOD 1.0 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10076000 -compress 1 -targetIntervals " + validationDataLocation + "NA12878.indels.intervals -B knownIndels,VCF," + validationDataLocation + "NA12878.indels.vcf4 -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe -knownsOnly", "-T IndelRealigner -noPG -LOD 1.0 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10076000 -compress 1 -targetIntervals " + validationDataLocation + "NA12878.indels.intervals -B knownIndels,VCF," + validationDataLocation + "NA12878.indels.vcf4 -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe -knownsOnly",
2, 2,

View File

@ -11,7 +11,7 @@ public class PickSequenomProbesIntegrationTest extends WalkerTest {
String testVCF = validationDataLocation + "complexExample.vcf4"; String testVCF = validationDataLocation + "complexExample.vcf4";
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T PickSequenomProbes -L 1:10,000,000-11,000,000 -B input,VCF,"+testVCF+" -o %s"; String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T PickSequenomProbes -L 1:10,000,000-11,000,000 -B input,VCF,"+testVCF+" -o %s";
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
Arrays.asList("46d2acea95d36725db63af61ee963ce6")); Arrays.asList("71e717e1813791575231f884b51c0aa3"));
executeTest("Test probes", spec); executeTest("Test probes", spec);
} }
@ -21,9 +21,9 @@ public class PickSequenomProbesIntegrationTest extends WalkerTest {
String testArgs = "-snp_mask " + GATKDataLocation + "/dbsnp_130_b36.rod -R " String testArgs = "-snp_mask " + GATKDataLocation + "/dbsnp_130_b36.rod -R "
+ oneKGLocation + "reference/human_b36_both.fasta -omitWindow -nameConvention " + oneKGLocation + "reference/human_b36_both.fasta -omitWindow -nameConvention "
+ "-project_id 1kgp3_s4_lf -T PickSequenomProbes -L " + validationDataLocation + + "-project_id 1kgp3_s4_lf -T PickSequenomProbes -L " + validationDataLocation +
"pickSeqIntegrationTest.interval_list -B input,VCF4,"+testVCF+" -o %s"; "pickSeqIntegrationTest.interval_list -B input,VCF,"+testVCF+" -o %s";
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
Arrays.asList("8b5b715b9918a0b70f4868614f197b72")); Arrays.asList("0ab37fe4db3fef345815c56e57e75cec"));
executeTest("Test probes", spec); executeTest("Test probes", spec);
} }
@ -35,9 +35,9 @@ public class PickSequenomProbesIntegrationTest extends WalkerTest {
String testArgs = "-snp_mask " + GATKDataLocation + "/dbsnp_130_b36.rod -R " String testArgs = "-snp_mask " + GATKDataLocation + "/dbsnp_130_b36.rod -R "
+ oneKGLocation + "reference/human_b36_both.fasta -omitWindow -nameConvention " + oneKGLocation + "reference/human_b36_both.fasta -omitWindow -nameConvention "
+ "-nmw 1 -project_id 1kgp3_s4_lf -T PickSequenomProbes -L " + validationDataLocation + + "-nmw 1 -project_id 1kgp3_s4_lf -T PickSequenomProbes -L " + validationDataLocation +
"pickSeqIntegrationTest.interval_list -B input,VCF4,"+testVCF+" -o %s"; "pickSeqIntegrationTest.interval_list -B input,VCF,"+testVCF+" -o %s";
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
Arrays.asList("03c8cef968ae2d0ef5f51ac82b24f891")); Arrays.asList("8f0bc8954069c659c203cbb53d4dbad2"));
executeTest("Test probes", spec); executeTest("Test probes", spec);
} }
} }

View File

@ -15,7 +15,8 @@ public class SequenomValidationConverterIntegrationTest extends WalkerTest {
executeTest("Test SNPs", spec); executeTest("Test SNPs", spec);
} }
@Test // @Test
// TODO- need to be reenabled when PED reader tracks gets updated to read indels correctly
public void testIndels() { public void testIndels() {
String testPedFile = validationDataLocation + "pilot2_indel_validation.renamed.ped"; String testPedFile = validationDataLocation + "pilot2_indel_validation.renamed.ped";
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s"; String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s";

View File

@ -56,15 +56,15 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
} }
@Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", ""); } @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "d0fab4a3ff0454385d5eee14e926e5ba"); }
@Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", ""); } // official project VCF files in tabix format @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "251e9b1d8b0e9f6801b71c0373c3b663"); } // official project VCF files in tabix format
@Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", ""); } @Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "1b66ed3e5911943cc7dc003f36646a4b"); }
@Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", ""); } @Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "9b204a6aaa1baa385664a1b058b7fbb8"); }
@Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "02f292cde282ab8b0c69459335abb74f"); } // official project VCF files in tabix format @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "9b06704822df411abd9f7ca16df5f2da"); } // official project VCF files in tabix format
@Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", ""); } @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "d28282213298878fd315a24d533db122"); }
@Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", ""); } @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "2689638bda75006430f4209e2d114b72"); }
@Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", ""); } @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "9025fbc8e5568426b18a2229a0d2d1c9"); }
} }

View File

@ -37,7 +37,7 @@ public class RodSystemValidationIntegrationTest extends WalkerTest {
baseTestString1KG() + " -B eval,VCF," + validationDataLocation + "MultiSample.vcf" + baseTestString1KG() + " -B eval,VCF," + validationDataLocation + "MultiSample.vcf" +
" -B eval2,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4" " -B eval2,VCF," + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4"
, 1, , 1,
Arrays.asList("216b5de6f58be6cf3286ed5261772904")); Arrays.asList("8d97cbc29f73a0027267858f961e6fe5"));
executeTest("testComplexVCFPileup", spec); executeTest("testComplexVCFPileup", spec);
} }