diff --git a/java/src/org/broad/tribble/vcf/VCFGenotypeEncoding.java b/java/src/org/broad/tribble/vcf/VCFGenotypeEncoding.java index 2cc56e610..1763d0865 100644 --- a/java/src/org/broad/tribble/vcf/VCFGenotypeEncoding.java +++ b/java/src/org/broad/tribble/vcf/VCFGenotypeEncoding.java @@ -128,7 +128,7 @@ public class VCFGenotypeEncoding { */ private static boolean validBases(String bases) { for (char c : bases.toUpperCase().toCharArray()) { - if (c != 'A' && c != 'C' && c != 'G' && c != 'T') + if (c != 'A' && c != 'C' && c != 'G' && c != 'T' && c != 'N') return false; } return true; diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java index f92a6d302..a595bee9b 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java @@ -167,7 +167,20 @@ public class Allele implements Comparable { // public Allele(byte base, boolean isRef) { return create( new byte[]{ base }, isRef); } - + + public static Allele extend(Allele left, byte[] right) { + byte[] bases = null; + if ( left.length() == 0 ) + bases = right; + else { + bases = new byte[left.length() + right.length]; + System.arraycopy(left.getBases(), 0, bases, 0, left.length()); + System.arraycopy(right, 0, bases, left.length(), right.length); + } + + return create(bases, left.isReference()); + } + /** * @param bases bases representing an allele * @return true if the bases represent the null allele diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java index eaf8f8d69..206c6d948 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java @@ -39,7 +39,7 @@ public class MutableGenotype extends Genotype { * @param alleles list of alleles */ public void setAlleles(List alleles) { - this.alleles.clear(); + this.alleles = new ArrayList(alleles); // todo -- add validation checking here @@ -51,14 +51,8 @@ public class MutableGenotype extends Genotype { if ( nNoCalls > 0 && nNoCalls != alleles.size() ) throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this); - for ( Allele allele : alleles ) { - addAllele(allele); - } - } - - public void addAllele(Allele allele) { - if ( allele == null ) throw new IllegalArgumentException("BUG: Cannot add a null allele to a genotype"); - this.alleles.add(allele); + for ( Allele allele : alleles ) + if ( allele == null ) throw new IllegalArgumentException("BUG: Cannot add a null allele to a genotype"); } // --------------------------------------------------------------------------------------------------------- diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java index 979a7a18d..e90b94cc7 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java @@ -237,7 +237,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param attributes */ public VariantContext(String name, GenomeLoc loc, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - this(name, loc, alleles, genotypes != null ? genotypeCollectionToMap(new HashMap(), genotypes) : null, negLog10PError, filters, attributes); + this(name, loc, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java index 95a208350..085515fc1 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java @@ -196,23 +196,33 @@ public class VariantContextUtils { Set alleles = new TreeSet(); Map genotypes = new TreeMap(); double negLog10PError = -1; - Set filters = new HashSet(); - Map attributes = new TreeMap(); + Set filters = new TreeSet(); + Map attributes = new TreeMap(first.getAttributes()); int depth = 0; // filtering values int nFiltered = 0; + Allele refAllele = determineReferenceAllele(VCs); + boolean remapped = false; + // cycle through and add info from the other VCs, making sure the loc/reference matches + for ( VariantContext vc : VCs ) { - if ( !loc.equals(vc.getLocation()) ) // || !first.getReference().equals(vc.getReference()) ) + if ( loc.getStart() != vc.getLocation().getStart() ) // || !first.getReference().equals(vc.getReference()) ) throw new StingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); + if ( vc.getLocation().size() > loc.size() ) + loc = vc.getLocation(); // get the longest location + nFiltered += vc.isFiltered() ? 1 : 0; - alleles.addAll(vc.getAlleles()); + AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); + remapped = remapped || alleleMapping.needsRemapping(); - mergeGenotypes(genotypes, vc, mergeOptions.contains(MergeType.UNIQUIFY_GENOTYPES)); + alleles.addAll(alleleMapping.values()); + + mergeGenotypes(genotypes, vc, alleleMapping, mergeOptions.contains(MergeType.UNIQUIFY_GENOTYPES)); negLog10PError = Math.max(negLog10PError, vc.isVariant() ? vc.getNegLog10PError() : -1); @@ -244,9 +254,92 @@ public class VariantContextUtils { if ( depth > 0 ) attributes.put(VCFRecord.DEPTH_KEY, String.valueOf(depth)); - return new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes); + + VariantContext merged = new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes); + //if ( remapped ) System.out.printf("Remapped => %s%n", merged); + return merged; } + private static class AlleleMapper { + private VariantContext vc = null; + private Map map = null; + public AlleleMapper(VariantContext vc) { this.vc = vc; } + public AlleleMapper(Map map) { this.map = map; } + public boolean needsRemapping() { return this.map != null; } + public Collection values() { return map != null ? map.values() : vc.getAlleles(); } + + public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; } + + public List remap(List as) { + List newAs = new ArrayList(); + for ( Allele a : as ) { + //System.out.printf(" Remapping %s => %s%n", a, remap(a)); + newAs.add(remap(a)); + } + return newAs; + } + } + + static private Allele determineReferenceAllele(List VCs) { + Allele ref = null; + + for ( VariantContext vc : VCs ) { + Allele myRef = vc.getReference(); + if ( ref == null || ref.length() < myRef.length() ) + ref = myRef; + else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) + throw new StingException("BUG: equal length references with difference bases: "+ ref + " " + myRef); + } + + return ref; + } + + static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set allAlleles) { + if ( refAllele.equals(vc.getReference()) ) + return new AlleleMapper(vc); + else { + // we really need to do some work. The refAllele is the longest reference allele seen at this + // start site. So imagine it is: + // + // refAllele: ACGTGA + // myRef: ACGT + // myAlt: - + // + // We need to remap all of the alleles in vc to include the extra GA so that + // myRef => refAllele and myAlt => GA + // + + Allele myRef = vc.getReference(); + if ( refAllele.length() <= myRef.length() ) throw new StingException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); + byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); + +// System.out.printf("Remapping allele at %s%n", vc); +// System.out.printf("ref %s%n", refAllele); +// System.out.printf("myref %s%n", myRef ); +// System.out.printf("extrabases %s%n", new String(extraBases)); + + Map map = new HashMap(); + for ( Allele a : vc.getAlleles() ) { + if ( a.isReference() ) + map.put(a, refAllele); + else { + Allele extended = Allele.extend(a, extraBases); + for ( Allele b : allAlleles ) + if ( extended.equals(b) ) + extended = b; +// System.out.printf(" Extending %s => %s%n", a, extended); + map.put(a, extended); + } + } + + // debugging +// System.out.printf("mapping %s%n", map); + + return new AlleleMapper(map); + } + } + + static class CompareByPriority implements Comparator { List priorityListOfVCs; public CompareByPriority(List priorityListOfVCs) { @@ -277,12 +370,19 @@ public class VariantContextUtils { } } - private static void mergeGenotypes(Map mergedGenotypes, VariantContext oneVC, boolean uniqifySamples) { + private static void mergeGenotypes(Map mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { for ( Genotype g : oneVC.getGenotypes().values() ) { String name = mergedSampleName(oneVC.getName(), g.getSampleName(), uniqifySamples); if ( ! mergedGenotypes.containsKey(name) ) { // only add if the name is new - Genotype newG = uniqifySamples ? g : new MutableGenotype(name, g); + Genotype newG = g; + + if ( uniqifySamples || alleleMapping.needsRemapping() ) { + MutableGenotype mutG = new MutableGenotype(name, g); + if ( alleleMapping.needsRemapping() ) mutG.setAlleles(alleleMapping.remap(g.getAlleles())); + newG = mutG; + } + mergedGenotypes.put(name, newG); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 1fae7a85a..958db810e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -69,7 +69,7 @@ public class CombineVariants extends RodWalker { //Set hInfo = new HashSet(); //hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - vcfWriter = new VCFWriter(out); + vcfWriter = new VCFWriter(out, true); priority = new ArrayList(Arrays.asList(PRIORITY_STRING.split(","))); validateAnnotateUnionArguments(priority); @@ -82,7 +82,7 @@ public class CombineVariants extends RodWalker { } private Set getSampleList(Map headers, EnumSet mergeOptions ) { - Set samples = new HashSet(); + Set samples = new TreeSet(); for ( Map.Entry val : headers.entrySet() ) { VCFHeader header = val.getValue(); for ( String sample : header.getGenotypeSamples() ) { @@ -110,10 +110,10 @@ public class CombineVariants extends RodWalker { Collection vcs = tracker.getAllVariantContexts(ref, context.getLocation()); VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true); if ( mergedVC != null ) // only operate at the start of events - if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed - vcfWriter.add(mergedVC, ref.getBases()); - else - logger.info(String.format("Ignoring complex event: " + mergedVC)); + //if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed + vcfWriter.add(mergedVC, ref.getBases()); +// else +// logger.info(String.format("Ignoring complex event: " + mergedVC)); return vcs.isEmpty() ? 0 : 1; } diff --git a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/AlleleUnitTest.java b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/AlleleUnitTest.java index 79c1e0859..d7bc8a2ee 100755 --- a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/AlleleUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/AlleleUnitTest.java @@ -222,4 +222,15 @@ public class AlleleUnitTest extends BaseTest { logger.warn("testBadConstructorArgs5"); Allele.create("A A"); } + + @Test + public void testExtend() { + logger.warn("testExtend"); + Assert.assertEquals("AT", Allele.extend(Allele.create("A"), "T".getBytes()).toString()); + Assert.assertEquals("ATA", Allele.extend(Allele.create("A"), "TA".getBytes()).toString()); + Assert.assertEquals("A", Allele.extend(Allele.create("-"), "A".getBytes()).toString()); + Assert.assertEquals("A", Allele.extend(Allele.NO_CALL, "A".getBytes()).toString()); + Assert.assertEquals("ATCGA", Allele.extend(Allele.create("AT"), "CGA".getBytes()).toString()); + Assert.assertEquals("ATCGA", Allele.extend(Allele.create("ATC"), "GA".getBytes()).toString()); + } } \ No newline at end of file