Nearly finalize merging capabilities for CombineVariants. Support for dealing with inconsistent indel alleles at loci. Improvements to Allele and removal of addAllele to MutableGenotype. We are close to being able to merge all of 1000 genomes -- snps and indels -- into a single combined vcf
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3710 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
cab8394103
commit
61e2b2e39b
|
|
@ -128,7 +128,7 @@ public class VCFGenotypeEncoding {
|
|||
*/
|
||||
private static boolean validBases(String bases) {
|
||||
for (char c : bases.toUpperCase().toCharArray()) {
|
||||
if (c != 'A' && c != 'C' && c != 'G' && c != 'T')
|
||||
if (c != 'A' && c != 'C' && c != 'G' && c != 'T' && c != 'N')
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -167,7 +167,20 @@ public class Allele implements Comparable<Allele> {
|
|||
// public Allele(byte base, boolean isRef) {
|
||||
return create( new byte[]{ base }, isRef);
|
||||
}
|
||||
|
||||
|
||||
public static Allele extend(Allele left, byte[] right) {
|
||||
byte[] bases = null;
|
||||
if ( left.length() == 0 )
|
||||
bases = right;
|
||||
else {
|
||||
bases = new byte[left.length() + right.length];
|
||||
System.arraycopy(left.getBases(), 0, bases, 0, left.length());
|
||||
System.arraycopy(right, 0, bases, left.length(), right.length);
|
||||
}
|
||||
|
||||
return create(bases, left.isReference());
|
||||
}
|
||||
|
||||
/**
|
||||
* @param bases bases representing an allele
|
||||
* @return true if the bases represent the null allele
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ public class MutableGenotype extends Genotype {
|
|||
* @param alleles list of alleles
|
||||
*/
|
||||
public void setAlleles(List<Allele> alleles) {
|
||||
this.alleles.clear();
|
||||
this.alleles = new ArrayList<Allele>(alleles);
|
||||
|
||||
// todo -- add validation checking here
|
||||
|
||||
|
|
@ -51,14 +51,8 @@ public class MutableGenotype extends Genotype {
|
|||
if ( nNoCalls > 0 && nNoCalls != alleles.size() )
|
||||
throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this);
|
||||
|
||||
for ( Allele allele : alleles ) {
|
||||
addAllele(allele);
|
||||
}
|
||||
}
|
||||
|
||||
public void addAllele(Allele allele) {
|
||||
if ( allele == null ) throw new IllegalArgumentException("BUG: Cannot add a null allele to a genotype");
|
||||
this.alleles.add(allele);
|
||||
for ( Allele allele : alleles )
|
||||
if ( allele == null ) throw new IllegalArgumentException("BUG: Cannot add a null allele to a genotype");
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -237,7 +237,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
* @param attributes
|
||||
*/
|
||||
public VariantContext(String name, GenomeLoc loc, Collection<Allele> alleles, Collection<Genotype> genotypes, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
|
||||
this(name, loc, alleles, genotypes != null ? genotypeCollectionToMap(new HashMap<String, Genotype>(), genotypes) : null, negLog10PError, filters, attributes);
|
||||
this(name, loc, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap<String, Genotype>(), genotypes) : null, negLog10PError, filters, attributes);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -196,23 +196,33 @@ public class VariantContextUtils {
|
|||
Set<Allele> alleles = new TreeSet<Allele>();
|
||||
Map<String, Genotype> genotypes = new TreeMap<String, Genotype>();
|
||||
double negLog10PError = -1;
|
||||
Set<String> filters = new HashSet<String>();
|
||||
Map<String, String> attributes = new TreeMap<String, String>();
|
||||
Set<String> filters = new TreeSet<String>();
|
||||
Map<String, Object> attributes = new TreeMap<String, Object>(first.getAttributes());
|
||||
int depth = 0;
|
||||
|
||||
// filtering values
|
||||
int nFiltered = 0;
|
||||
|
||||
Allele refAllele = determineReferenceAllele(VCs);
|
||||
boolean remapped = false;
|
||||
|
||||
// cycle through and add info from the other VCs, making sure the loc/reference matches
|
||||
|
||||
for ( VariantContext vc : VCs ) {
|
||||
if ( !loc.equals(vc.getLocation()) ) // || !first.getReference().equals(vc.getReference()) )
|
||||
if ( loc.getStart() != vc.getLocation().getStart() ) // || !first.getReference().equals(vc.getReference()) )
|
||||
throw new StingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString());
|
||||
|
||||
if ( vc.getLocation().size() > loc.size() )
|
||||
loc = vc.getLocation(); // get the longest location
|
||||
|
||||
nFiltered += vc.isFiltered() ? 1 : 0;
|
||||
|
||||
alleles.addAll(vc.getAlleles());
|
||||
AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles);
|
||||
remapped = remapped || alleleMapping.needsRemapping();
|
||||
|
||||
mergeGenotypes(genotypes, vc, mergeOptions.contains(MergeType.UNIQUIFY_GENOTYPES));
|
||||
alleles.addAll(alleleMapping.values());
|
||||
|
||||
mergeGenotypes(genotypes, vc, alleleMapping, mergeOptions.contains(MergeType.UNIQUIFY_GENOTYPES));
|
||||
|
||||
negLog10PError = Math.max(negLog10PError, vc.isVariant() ? vc.getNegLog10PError() : -1);
|
||||
|
||||
|
|
@ -244,9 +254,92 @@ public class VariantContextUtils {
|
|||
|
||||
if ( depth > 0 )
|
||||
attributes.put(VCFRecord.DEPTH_KEY, String.valueOf(depth));
|
||||
return new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes);
|
||||
|
||||
VariantContext merged = new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes);
|
||||
//if ( remapped ) System.out.printf("Remapped => %s%n", merged);
|
||||
return merged;
|
||||
}
|
||||
|
||||
private static class AlleleMapper {
|
||||
private VariantContext vc = null;
|
||||
private Map<Allele, Allele> map = null;
|
||||
public AlleleMapper(VariantContext vc) { this.vc = vc; }
|
||||
public AlleleMapper(Map<Allele, Allele> map) { this.map = map; }
|
||||
public boolean needsRemapping() { return this.map != null; }
|
||||
public Collection<Allele> values() { return map != null ? map.values() : vc.getAlleles(); }
|
||||
|
||||
public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; }
|
||||
|
||||
public List<Allele> remap(List<Allele> as) {
|
||||
List<Allele> newAs = new ArrayList<Allele>();
|
||||
for ( Allele a : as ) {
|
||||
//System.out.printf(" Remapping %s => %s%n", a, remap(a));
|
||||
newAs.add(remap(a));
|
||||
}
|
||||
return newAs;
|
||||
}
|
||||
}
|
||||
|
||||
static private Allele determineReferenceAllele(List<VariantContext> VCs) {
|
||||
Allele ref = null;
|
||||
|
||||
for ( VariantContext vc : VCs ) {
|
||||
Allele myRef = vc.getReference();
|
||||
if ( ref == null || ref.length() < myRef.length() )
|
||||
ref = myRef;
|
||||
else if ( ref.length() == myRef.length() && ! ref.equals(myRef) )
|
||||
throw new StingException("BUG: equal length references with difference bases: "+ ref + " " + myRef);
|
||||
}
|
||||
|
||||
return ref;
|
||||
}
|
||||
|
||||
static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set<Allele> allAlleles) {
|
||||
if ( refAllele.equals(vc.getReference()) )
|
||||
return new AlleleMapper(vc);
|
||||
else {
|
||||
// we really need to do some work. The refAllele is the longest reference allele seen at this
|
||||
// start site. So imagine it is:
|
||||
//
|
||||
// refAllele: ACGTGA
|
||||
// myRef: ACGT
|
||||
// myAlt: -
|
||||
//
|
||||
// We need to remap all of the alleles in vc to include the extra GA so that
|
||||
// myRef => refAllele and myAlt => GA
|
||||
//
|
||||
|
||||
Allele myRef = vc.getReference();
|
||||
if ( refAllele.length() <= myRef.length() ) throw new StingException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele);
|
||||
byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length());
|
||||
|
||||
// System.out.printf("Remapping allele at %s%n", vc);
|
||||
// System.out.printf("ref %s%n", refAllele);
|
||||
// System.out.printf("myref %s%n", myRef );
|
||||
// System.out.printf("extrabases %s%n", new String(extraBases));
|
||||
|
||||
Map<Allele, Allele> map = new HashMap<Allele, Allele>();
|
||||
for ( Allele a : vc.getAlleles() ) {
|
||||
if ( a.isReference() )
|
||||
map.put(a, refAllele);
|
||||
else {
|
||||
Allele extended = Allele.extend(a, extraBases);
|
||||
for ( Allele b : allAlleles )
|
||||
if ( extended.equals(b) )
|
||||
extended = b;
|
||||
// System.out.printf(" Extending %s => %s%n", a, extended);
|
||||
map.put(a, extended);
|
||||
}
|
||||
}
|
||||
|
||||
// debugging
|
||||
// System.out.printf("mapping %s%n", map);
|
||||
|
||||
return new AlleleMapper(map);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static class CompareByPriority implements Comparator<VariantContext> {
|
||||
List<String> priorityListOfVCs;
|
||||
public CompareByPriority(List<String> priorityListOfVCs) {
|
||||
|
|
@ -277,12 +370,19 @@ public class VariantContextUtils {
|
|||
}
|
||||
}
|
||||
|
||||
private static void mergeGenotypes(Map<String, Genotype> mergedGenotypes, VariantContext oneVC, boolean uniqifySamples) {
|
||||
private static void mergeGenotypes(Map<String, Genotype> mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) {
|
||||
for ( Genotype g : oneVC.getGenotypes().values() ) {
|
||||
String name = mergedSampleName(oneVC.getName(), g.getSampleName(), uniqifySamples);
|
||||
if ( ! mergedGenotypes.containsKey(name) ) {
|
||||
// only add if the name is new
|
||||
Genotype newG = uniqifySamples ? g : new MutableGenotype(name, g);
|
||||
Genotype newG = g;
|
||||
|
||||
if ( uniqifySamples || alleleMapping.needsRemapping() ) {
|
||||
MutableGenotype mutG = new MutableGenotype(name, g);
|
||||
if ( alleleMapping.needsRemapping() ) mutG.setAlleles(alleleMapping.remap(g.getAlleles()));
|
||||
newG = mutG;
|
||||
}
|
||||
|
||||
mergedGenotypes.put(name, newG);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -69,7 +69,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
//Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
//hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
|
||||
vcfWriter = new VCFWriter(out);
|
||||
vcfWriter = new VCFWriter(out, true);
|
||||
priority = new ArrayList<String>(Arrays.asList(PRIORITY_STRING.split(",")));
|
||||
|
||||
validateAnnotateUnionArguments(priority);
|
||||
|
|
@ -82,7 +82,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
private Set<String> getSampleList(Map<String, VCFHeader> headers, EnumSet<VariantContextUtils.MergeType> mergeOptions ) {
|
||||
Set<String> samples = new HashSet<String>();
|
||||
Set<String> samples = new TreeSet<String>();
|
||||
for ( Map.Entry<String, VCFHeader> val : headers.entrySet() ) {
|
||||
VCFHeader header = val.getValue();
|
||||
for ( String sample : header.getGenotypeSamples() ) {
|
||||
|
|
@ -110,10 +110,10 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, context.getLocation());
|
||||
VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true);
|
||||
if ( mergedVC != null ) // only operate at the start of events
|
||||
if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed
|
||||
vcfWriter.add(mergedVC, ref.getBases());
|
||||
else
|
||||
logger.info(String.format("Ignoring complex event: " + mergedVC));
|
||||
//if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed
|
||||
vcfWriter.add(mergedVC, ref.getBases());
|
||||
// else
|
||||
// logger.info(String.format("Ignoring complex event: " + mergedVC));
|
||||
|
||||
return vcs.isEmpty() ? 0 : 1;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -222,4 +222,15 @@ public class AlleleUnitTest extends BaseTest {
|
|||
logger.warn("testBadConstructorArgs5");
|
||||
Allele.create("A A");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtend() {
|
||||
logger.warn("testExtend");
|
||||
Assert.assertEquals("AT", Allele.extend(Allele.create("A"), "T".getBytes()).toString());
|
||||
Assert.assertEquals("ATA", Allele.extend(Allele.create("A"), "TA".getBytes()).toString());
|
||||
Assert.assertEquals("A", Allele.extend(Allele.create("-"), "A".getBytes()).toString());
|
||||
Assert.assertEquals("A", Allele.extend(Allele.NO_CALL, "A".getBytes()).toString());
|
||||
Assert.assertEquals("ATCGA", Allele.extend(Allele.create("AT"), "CGA".getBytes()).toString());
|
||||
Assert.assertEquals("ATCGA", Allele.extend(Allele.create("ATC"), "GA".getBytes()).toString());
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue