From 61e2b2e39bbcd10f898e436e8ca298826fb1ca7b Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 2 Jul 2010 13:32:33 +0000 Subject: [PATCH] Nearly finalize merging capabilities for CombineVariants. Support for dealing with inconsistent indel alleles at loci. Improvements to Allele and removal of addAllele to MutableGenotype. We are close to being able to merge all of 1000 genomes -- snps and indels -- into a single combined vcf git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3710 348d0f76-0448-11de-a6fe-93d51630548a --- .../tribble/vcf/VCFGenotypeEncoding.java | 2 +- .../gatk/contexts/variantcontext/Allele.java | 15 ++- .../variantcontext/MutableGenotype.java | 12 +- .../variantcontext/VariantContext.java | 2 +- .../variantcontext/VariantContextUtils.java | 116 ++++++++++++++++-- .../walkers/variantutils/CombineVariants.java | 12 +- .../variantcontext/AlleleUnitTest.java | 11 ++ 7 files changed, 144 insertions(+), 26 deletions(-) diff --git a/java/src/org/broad/tribble/vcf/VCFGenotypeEncoding.java b/java/src/org/broad/tribble/vcf/VCFGenotypeEncoding.java index 2cc56e610..1763d0865 100644 --- a/java/src/org/broad/tribble/vcf/VCFGenotypeEncoding.java +++ b/java/src/org/broad/tribble/vcf/VCFGenotypeEncoding.java @@ -128,7 +128,7 @@ public class VCFGenotypeEncoding { */ private static boolean validBases(String bases) { for (char c : bases.toUpperCase().toCharArray()) { - if (c != 'A' && c != 'C' && c != 'G' && c != 'T') + if (c != 'A' && c != 'C' && c != 'G' && c != 'T' && c != 'N') return false; } return true; diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java index f92a6d302..a595bee9b 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java @@ -167,7 +167,20 @@ public class Allele implements Comparable { // public Allele(byte base, boolean isRef) { return create( new byte[]{ base }, isRef); } - + + public static Allele extend(Allele left, byte[] right) { + byte[] bases = null; + if ( left.length() == 0 ) + bases = right; + else { + bases = new byte[left.length() + right.length]; + System.arraycopy(left.getBases(), 0, bases, 0, left.length()); + System.arraycopy(right, 0, bases, left.length(), right.length); + } + + return create(bases, left.isReference()); + } + /** * @param bases bases representing an allele * @return true if the bases represent the null allele diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java index eaf8f8d69..206c6d948 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java @@ -39,7 +39,7 @@ public class MutableGenotype extends Genotype { * @param alleles list of alleles */ public void setAlleles(List alleles) { - this.alleles.clear(); + this.alleles = new ArrayList(alleles); // todo -- add validation checking here @@ -51,14 +51,8 @@ public class MutableGenotype extends Genotype { if ( nNoCalls > 0 && nNoCalls != alleles.size() ) throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this); - for ( Allele allele : alleles ) { - addAllele(allele); - } - } - - public void addAllele(Allele allele) { - if ( allele == null ) throw new IllegalArgumentException("BUG: Cannot add a null allele to a genotype"); - this.alleles.add(allele); + for ( Allele allele : alleles ) + if ( allele == null ) throw new IllegalArgumentException("BUG: Cannot add a null allele to a genotype"); } // --------------------------------------------------------------------------------------------------------- diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java index 979a7a18d..e90b94cc7 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java @@ -237,7 +237,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param attributes */ public VariantContext(String name, GenomeLoc loc, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - this(name, loc, alleles, genotypes != null ? genotypeCollectionToMap(new HashMap(), genotypes) : null, negLog10PError, filters, attributes); + this(name, loc, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java index 95a208350..085515fc1 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java @@ -196,23 +196,33 @@ public class VariantContextUtils { Set alleles = new TreeSet(); Map genotypes = new TreeMap(); double negLog10PError = -1; - Set filters = new HashSet(); - Map attributes = new TreeMap(); + Set filters = new TreeSet(); + Map attributes = new TreeMap(first.getAttributes()); int depth = 0; // filtering values int nFiltered = 0; + Allele refAllele = determineReferenceAllele(VCs); + boolean remapped = false; + // cycle through and add info from the other VCs, making sure the loc/reference matches + for ( VariantContext vc : VCs ) { - if ( !loc.equals(vc.getLocation()) ) // || !first.getReference().equals(vc.getReference()) ) + if ( loc.getStart() != vc.getLocation().getStart() ) // || !first.getReference().equals(vc.getReference()) ) throw new StingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); + if ( vc.getLocation().size() > loc.size() ) + loc = vc.getLocation(); // get the longest location + nFiltered += vc.isFiltered() ? 1 : 0; - alleles.addAll(vc.getAlleles()); + AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); + remapped = remapped || alleleMapping.needsRemapping(); - mergeGenotypes(genotypes, vc, mergeOptions.contains(MergeType.UNIQUIFY_GENOTYPES)); + alleles.addAll(alleleMapping.values()); + + mergeGenotypes(genotypes, vc, alleleMapping, mergeOptions.contains(MergeType.UNIQUIFY_GENOTYPES)); negLog10PError = Math.max(negLog10PError, vc.isVariant() ? vc.getNegLog10PError() : -1); @@ -244,9 +254,92 @@ public class VariantContextUtils { if ( depth > 0 ) attributes.put(VCFRecord.DEPTH_KEY, String.valueOf(depth)); - return new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes); + + VariantContext merged = new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes); + //if ( remapped ) System.out.printf("Remapped => %s%n", merged); + return merged; } + private static class AlleleMapper { + private VariantContext vc = null; + private Map map = null; + public AlleleMapper(VariantContext vc) { this.vc = vc; } + public AlleleMapper(Map map) { this.map = map; } + public boolean needsRemapping() { return this.map != null; } + public Collection values() { return map != null ? map.values() : vc.getAlleles(); } + + public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; } + + public List remap(List as) { + List newAs = new ArrayList(); + for ( Allele a : as ) { + //System.out.printf(" Remapping %s => %s%n", a, remap(a)); + newAs.add(remap(a)); + } + return newAs; + } + } + + static private Allele determineReferenceAllele(List VCs) { + Allele ref = null; + + for ( VariantContext vc : VCs ) { + Allele myRef = vc.getReference(); + if ( ref == null || ref.length() < myRef.length() ) + ref = myRef; + else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) + throw new StingException("BUG: equal length references with difference bases: "+ ref + " " + myRef); + } + + return ref; + } + + static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set allAlleles) { + if ( refAllele.equals(vc.getReference()) ) + return new AlleleMapper(vc); + else { + // we really need to do some work. The refAllele is the longest reference allele seen at this + // start site. So imagine it is: + // + // refAllele: ACGTGA + // myRef: ACGT + // myAlt: - + // + // We need to remap all of the alleles in vc to include the extra GA so that + // myRef => refAllele and myAlt => GA + // + + Allele myRef = vc.getReference(); + if ( refAllele.length() <= myRef.length() ) throw new StingException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); + byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); + +// System.out.printf("Remapping allele at %s%n", vc); +// System.out.printf("ref %s%n", refAllele); +// System.out.printf("myref %s%n", myRef ); +// System.out.printf("extrabases %s%n", new String(extraBases)); + + Map map = new HashMap(); + for ( Allele a : vc.getAlleles() ) { + if ( a.isReference() ) + map.put(a, refAllele); + else { + Allele extended = Allele.extend(a, extraBases); + for ( Allele b : allAlleles ) + if ( extended.equals(b) ) + extended = b; +// System.out.printf(" Extending %s => %s%n", a, extended); + map.put(a, extended); + } + } + + // debugging +// System.out.printf("mapping %s%n", map); + + return new AlleleMapper(map); + } + } + + static class CompareByPriority implements Comparator { List priorityListOfVCs; public CompareByPriority(List priorityListOfVCs) { @@ -277,12 +370,19 @@ public class VariantContextUtils { } } - private static void mergeGenotypes(Map mergedGenotypes, VariantContext oneVC, boolean uniqifySamples) { + private static void mergeGenotypes(Map mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { for ( Genotype g : oneVC.getGenotypes().values() ) { String name = mergedSampleName(oneVC.getName(), g.getSampleName(), uniqifySamples); if ( ! mergedGenotypes.containsKey(name) ) { // only add if the name is new - Genotype newG = uniqifySamples ? g : new MutableGenotype(name, g); + Genotype newG = g; + + if ( uniqifySamples || alleleMapping.needsRemapping() ) { + MutableGenotype mutG = new MutableGenotype(name, g); + if ( alleleMapping.needsRemapping() ) mutG.setAlleles(alleleMapping.remap(g.getAlleles())); + newG = mutG; + } + mergedGenotypes.put(name, newG); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 1fae7a85a..958db810e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -69,7 +69,7 @@ public class CombineVariants extends RodWalker { //Set hInfo = new HashSet(); //hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - vcfWriter = new VCFWriter(out); + vcfWriter = new VCFWriter(out, true); priority = new ArrayList(Arrays.asList(PRIORITY_STRING.split(","))); validateAnnotateUnionArguments(priority); @@ -82,7 +82,7 @@ public class CombineVariants extends RodWalker { } private Set getSampleList(Map headers, EnumSet mergeOptions ) { - Set samples = new HashSet(); + Set samples = new TreeSet(); for ( Map.Entry val : headers.entrySet() ) { VCFHeader header = val.getValue(); for ( String sample : header.getGenotypeSamples() ) { @@ -110,10 +110,10 @@ public class CombineVariants extends RodWalker { Collection vcs = tracker.getAllVariantContexts(ref, context.getLocation()); VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true); if ( mergedVC != null ) // only operate at the start of events - if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed - vcfWriter.add(mergedVC, ref.getBases()); - else - logger.info(String.format("Ignoring complex event: " + mergedVC)); + //if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed + vcfWriter.add(mergedVC, ref.getBases()); +// else +// logger.info(String.format("Ignoring complex event: " + mergedVC)); return vcs.isEmpty() ? 0 : 1; } diff --git a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/AlleleUnitTest.java b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/AlleleUnitTest.java index 79c1e0859..d7bc8a2ee 100755 --- a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/AlleleUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/AlleleUnitTest.java @@ -222,4 +222,15 @@ public class AlleleUnitTest extends BaseTest { logger.warn("testBadConstructorArgs5"); Allele.create("A A"); } + + @Test + public void testExtend() { + logger.warn("testExtend"); + Assert.assertEquals("AT", Allele.extend(Allele.create("A"), "T".getBytes()).toString()); + Assert.assertEquals("ATA", Allele.extend(Allele.create("A"), "TA".getBytes()).toString()); + Assert.assertEquals("A", Allele.extend(Allele.create("-"), "A".getBytes()).toString()); + Assert.assertEquals("A", Allele.extend(Allele.NO_CALL, "A".getBytes()).toString()); + Assert.assertEquals("ATCGA", Allele.extend(Allele.create("AT"), "CGA".getBytes()).toString()); + Assert.assertEquals("ATCGA", Allele.extend(Allele.create("ATC"), "GA".getBytes()).toString()); + } } \ No newline at end of file