Nearly finalize merging capabilities for CombineVariants. Support for dealing with inconsistent indel alleles at loci. Improvements to Allele and removal of addAllele to MutableGenotype. We are close to being able to merge all of 1000 genomes -- snps and indels -- into a single combined vcf

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3710 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-07-02 13:32:33 +00:00
parent cab8394103
commit 61e2b2e39b
7 changed files with 144 additions and 26 deletions

View File

@ -128,7 +128,7 @@ public class VCFGenotypeEncoding {
*/ */
private static boolean validBases(String bases) { private static boolean validBases(String bases) {
for (char c : bases.toUpperCase().toCharArray()) { for (char c : bases.toUpperCase().toCharArray()) {
if (c != 'A' && c != 'C' && c != 'G' && c != 'T') if (c != 'A' && c != 'C' && c != 'G' && c != 'T' && c != 'N')
return false; return false;
} }
return true; return true;

View File

@ -168,6 +168,19 @@ public class Allele implements Comparable<Allele> {
return create( new byte[]{ base }, isRef); return create( new byte[]{ base }, isRef);
} }
public static Allele extend(Allele left, byte[] right) {
byte[] bases = null;
if ( left.length() == 0 )
bases = right;
else {
bases = new byte[left.length() + right.length];
System.arraycopy(left.getBases(), 0, bases, 0, left.length());
System.arraycopy(right, 0, bases, left.length(), right.length);
}
return create(bases, left.isReference());
}
/** /**
* @param bases bases representing an allele * @param bases bases representing an allele
* @return true if the bases represent the null allele * @return true if the bases represent the null allele

View File

@ -39,7 +39,7 @@ public class MutableGenotype extends Genotype {
* @param alleles list of alleles * @param alleles list of alleles
*/ */
public void setAlleles(List<Allele> alleles) { public void setAlleles(List<Allele> alleles) {
this.alleles.clear(); this.alleles = new ArrayList<Allele>(alleles);
// todo -- add validation checking here // todo -- add validation checking here
@ -51,14 +51,8 @@ public class MutableGenotype extends Genotype {
if ( nNoCalls > 0 && nNoCalls != alleles.size() ) if ( nNoCalls > 0 && nNoCalls != alleles.size() )
throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this); throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this);
for ( Allele allele : alleles ) { for ( Allele allele : alleles )
addAllele(allele); if ( allele == null ) throw new IllegalArgumentException("BUG: Cannot add a null allele to a genotype");
}
}
public void addAllele(Allele allele) {
if ( allele == null ) throw new IllegalArgumentException("BUG: Cannot add a null allele to a genotype");
this.alleles.add(allele);
} }
// --------------------------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------------------------

View File

@ -237,7 +237,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
* @param attributes * @param attributes
*/ */
public VariantContext(String name, GenomeLoc loc, Collection<Allele> alleles, Collection<Genotype> genotypes, double negLog10PError, Set<String> filters, Map<String, ?> attributes) { public VariantContext(String name, GenomeLoc loc, Collection<Allele> alleles, Collection<Genotype> genotypes, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
this(name, loc, alleles, genotypes != null ? genotypeCollectionToMap(new HashMap<String, Genotype>(), genotypes) : null, negLog10PError, filters, attributes); this(name, loc, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap<String, Genotype>(), genotypes) : null, negLog10PError, filters, attributes);
} }
/** /**

View File

@ -196,23 +196,33 @@ public class VariantContextUtils {
Set<Allele> alleles = new TreeSet<Allele>(); Set<Allele> alleles = new TreeSet<Allele>();
Map<String, Genotype> genotypes = new TreeMap<String, Genotype>(); Map<String, Genotype> genotypes = new TreeMap<String, Genotype>();
double negLog10PError = -1; double negLog10PError = -1;
Set<String> filters = new HashSet<String>(); Set<String> filters = new TreeSet<String>();
Map<String, String> attributes = new TreeMap<String, String>(); Map<String, Object> attributes = new TreeMap<String, Object>(first.getAttributes());
int depth = 0; int depth = 0;
// filtering values // filtering values
int nFiltered = 0; int nFiltered = 0;
Allele refAllele = determineReferenceAllele(VCs);
boolean remapped = false;
// cycle through and add info from the other VCs, making sure the loc/reference matches // cycle through and add info from the other VCs, making sure the loc/reference matches
for ( VariantContext vc : VCs ) { for ( VariantContext vc : VCs ) {
if ( !loc.equals(vc.getLocation()) ) // || !first.getReference().equals(vc.getReference()) ) if ( loc.getStart() != vc.getLocation().getStart() ) // || !first.getReference().equals(vc.getReference()) )
throw new StingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); throw new StingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString());
if ( vc.getLocation().size() > loc.size() )
loc = vc.getLocation(); // get the longest location
nFiltered += vc.isFiltered() ? 1 : 0; nFiltered += vc.isFiltered() ? 1 : 0;
alleles.addAll(vc.getAlleles()); AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles);
remapped = remapped || alleleMapping.needsRemapping();
mergeGenotypes(genotypes, vc, mergeOptions.contains(MergeType.UNIQUIFY_GENOTYPES)); alleles.addAll(alleleMapping.values());
mergeGenotypes(genotypes, vc, alleleMapping, mergeOptions.contains(MergeType.UNIQUIFY_GENOTYPES));
negLog10PError = Math.max(negLog10PError, vc.isVariant() ? vc.getNegLog10PError() : -1); negLog10PError = Math.max(negLog10PError, vc.isVariant() ? vc.getNegLog10PError() : -1);
@ -244,9 +254,92 @@ public class VariantContextUtils {
if ( depth > 0 ) if ( depth > 0 )
attributes.put(VCFRecord.DEPTH_KEY, String.valueOf(depth)); attributes.put(VCFRecord.DEPTH_KEY, String.valueOf(depth));
return new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes);
VariantContext merged = new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes);
//if ( remapped ) System.out.printf("Remapped => %s%n", merged);
return merged;
} }
private static class AlleleMapper {
private VariantContext vc = null;
private Map<Allele, Allele> map = null;
public AlleleMapper(VariantContext vc) { this.vc = vc; }
public AlleleMapper(Map<Allele, Allele> map) { this.map = map; }
public boolean needsRemapping() { return this.map != null; }
public Collection<Allele> values() { return map != null ? map.values() : vc.getAlleles(); }
public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; }
public List<Allele> remap(List<Allele> as) {
List<Allele> newAs = new ArrayList<Allele>();
for ( Allele a : as ) {
//System.out.printf(" Remapping %s => %s%n", a, remap(a));
newAs.add(remap(a));
}
return newAs;
}
}
static private Allele determineReferenceAllele(List<VariantContext> VCs) {
Allele ref = null;
for ( VariantContext vc : VCs ) {
Allele myRef = vc.getReference();
if ( ref == null || ref.length() < myRef.length() )
ref = myRef;
else if ( ref.length() == myRef.length() && ! ref.equals(myRef) )
throw new StingException("BUG: equal length references with difference bases: "+ ref + " " + myRef);
}
return ref;
}
static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set<Allele> allAlleles) {
if ( refAllele.equals(vc.getReference()) )
return new AlleleMapper(vc);
else {
// we really need to do some work. The refAllele is the longest reference allele seen at this
// start site. So imagine it is:
//
// refAllele: ACGTGA
// myRef: ACGT
// myAlt: -
//
// We need to remap all of the alleles in vc to include the extra GA so that
// myRef => refAllele and myAlt => GA
//
Allele myRef = vc.getReference();
if ( refAllele.length() <= myRef.length() ) throw new StingException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele);
byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length());
// System.out.printf("Remapping allele at %s%n", vc);
// System.out.printf("ref %s%n", refAllele);
// System.out.printf("myref %s%n", myRef );
// System.out.printf("extrabases %s%n", new String(extraBases));
Map<Allele, Allele> map = new HashMap<Allele, Allele>();
for ( Allele a : vc.getAlleles() ) {
if ( a.isReference() )
map.put(a, refAllele);
else {
Allele extended = Allele.extend(a, extraBases);
for ( Allele b : allAlleles )
if ( extended.equals(b) )
extended = b;
// System.out.printf(" Extending %s => %s%n", a, extended);
map.put(a, extended);
}
}
// debugging
// System.out.printf("mapping %s%n", map);
return new AlleleMapper(map);
}
}
static class CompareByPriority implements Comparator<VariantContext> { static class CompareByPriority implements Comparator<VariantContext> {
List<String> priorityListOfVCs; List<String> priorityListOfVCs;
public CompareByPriority(List<String> priorityListOfVCs) { public CompareByPriority(List<String> priorityListOfVCs) {
@ -277,12 +370,19 @@ public class VariantContextUtils {
} }
} }
private static void mergeGenotypes(Map<String, Genotype> mergedGenotypes, VariantContext oneVC, boolean uniqifySamples) { private static void mergeGenotypes(Map<String, Genotype> mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) {
for ( Genotype g : oneVC.getGenotypes().values() ) { for ( Genotype g : oneVC.getGenotypes().values() ) {
String name = mergedSampleName(oneVC.getName(), g.getSampleName(), uniqifySamples); String name = mergedSampleName(oneVC.getName(), g.getSampleName(), uniqifySamples);
if ( ! mergedGenotypes.containsKey(name) ) { if ( ! mergedGenotypes.containsKey(name) ) {
// only add if the name is new // only add if the name is new
Genotype newG = uniqifySamples ? g : new MutableGenotype(name, g); Genotype newG = g;
if ( uniqifySamples || alleleMapping.needsRemapping() ) {
MutableGenotype mutG = new MutableGenotype(name, g);
if ( alleleMapping.needsRemapping() ) mutG.setAlleles(alleleMapping.remap(g.getAlleles()));
newG = mutG;
}
mergedGenotypes.put(name, newG); mergedGenotypes.put(name, newG);
} }
} }

View File

@ -69,7 +69,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
//Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>(); //Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
//hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); //hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
vcfWriter = new VCFWriter(out); vcfWriter = new VCFWriter(out, true);
priority = new ArrayList<String>(Arrays.asList(PRIORITY_STRING.split(","))); priority = new ArrayList<String>(Arrays.asList(PRIORITY_STRING.split(",")));
validateAnnotateUnionArguments(priority); validateAnnotateUnionArguments(priority);
@ -82,7 +82,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
} }
private Set<String> getSampleList(Map<String, VCFHeader> headers, EnumSet<VariantContextUtils.MergeType> mergeOptions ) { private Set<String> getSampleList(Map<String, VCFHeader> headers, EnumSet<VariantContextUtils.MergeType> mergeOptions ) {
Set<String> samples = new HashSet<String>(); Set<String> samples = new TreeSet<String>();
for ( Map.Entry<String, VCFHeader> val : headers.entrySet() ) { for ( Map.Entry<String, VCFHeader> val : headers.entrySet() ) {
VCFHeader header = val.getValue(); VCFHeader header = val.getValue();
for ( String sample : header.getGenotypeSamples() ) { for ( String sample : header.getGenotypeSamples() ) {
@ -110,10 +110,10 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, context.getLocation()); Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, context.getLocation());
VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true); VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true);
if ( mergedVC != null ) // only operate at the start of events if ( mergedVC != null ) // only operate at the start of events
if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed //if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed
vcfWriter.add(mergedVC, ref.getBases()); vcfWriter.add(mergedVC, ref.getBases());
else // else
logger.info(String.format("Ignoring complex event: " + mergedVC)); // logger.info(String.format("Ignoring complex event: " + mergedVC));
return vcs.isEmpty() ? 0 : 1; return vcs.isEmpty() ? 0 : 1;
} }

View File

@ -222,4 +222,15 @@ public class AlleleUnitTest extends BaseTest {
logger.warn("testBadConstructorArgs5"); logger.warn("testBadConstructorArgs5");
Allele.create("A A"); Allele.create("A A");
} }
@Test
public void testExtend() {
logger.warn("testExtend");
Assert.assertEquals("AT", Allele.extend(Allele.create("A"), "T".getBytes()).toString());
Assert.assertEquals("ATA", Allele.extend(Allele.create("A"), "TA".getBytes()).toString());
Assert.assertEquals("A", Allele.extend(Allele.create("-"), "A".getBytes()).toString());
Assert.assertEquals("A", Allele.extend(Allele.NO_CALL, "A".getBytes()).toString());
Assert.assertEquals("ATCGA", Allele.extend(Allele.create("AT"), "CGA".getBytes()).toString());
Assert.assertEquals("ATCGA", Allele.extend(Allele.create("ATC"), "GA".getBytes()).toString());
}
} }