clip VCF alleles for indels: only a single left base, and as many right bases as align before converting to variant context.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3614 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-06-22 22:42:38 +00:00
parent 9872b65803
commit 0cafd3d642
2 changed files with 34 additions and 25 deletions

View File

@ -409,44 +409,29 @@ public class VCF4Codec implements FeatureCodec {
static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
List<Allele> newAlleleList = new ArrayList<Allele>();
// find the smallest allele
int minimumAllele = (unclippedAlleles.size() > 0) ? unclippedAlleles.get(0).length() : Integer.MAX_VALUE;
for (Allele smallest : unclippedAlleles)
if (smallest.length() < minimumAllele)
minimumAllele = smallest.length();
// find the preceeding string common to all alleles and the reference
int forwardClipped = 0;
boolean clipping = true;
while (clipping) {
for (Allele a : unclippedAlleles)
if (forwardClipped > ref.length() - 1)
clipping = false;
else if (a.length() <= forwardClipped || (a.getBases()[forwardClipped] != ref.getBytes()[forwardClipped])) {
for (Allele a : unclippedAlleles)
if (a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0])) {
clipping = false;
}
if (clipping) forwardClipped++;
}
int forwardClipping = (clipping) ? 1 : 0;
int reverseClipped = 0;
clipping = true;
while (clipping) {
for (Allele a : unclippedAlleles)
if (a.length() - reverseClipped < 0 || a.length() - forwardClipped == 0)
if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0)
clipping = false;
else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1])
clipping = false;
if (clipping) reverseClipped++;
}
// check to see if we're about to clip all the bases from the reference, if so back off the front clip a base
//if (forwardClipped + reverseClipped >= ref.length() || forwardClipped + reverseClipped >= minimumAllele)
// forwardClipped--;
for (Allele a : unclippedAlleles)
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipped,a.getBases().length-reverseClipped),a.isReference()));
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipped,(position+ref.length()-reverseClipped-1)),
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipping,a.getBases().length-reverseClipped),a.isReference()));
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipping,(position+ref.length()-reverseClipped-1)),
newAlleleList);
}

View File

@ -199,8 +199,7 @@ public class VCF4UnitTest extends BaseTest {
public void checkLargeVCF() {
TestSetup testSetup = new TestSetup().invoke(largeVCF);
AsciiLineReader reader = testSetup.getReader();
VCF4Codec codec = testSetup.getCodec();
// now parse the lines
String line = null;
try {
@ -299,6 +298,30 @@ public class VCF4UnitTest extends BaseTest {
Assert.assertTrue(locAndList.second.get(2).toString().equals("-"));
}
/**
* test out the clipping of alleles (removing extra context provided by VCF implementation).
*/
@Test
public void testClippingManyPotentialFrontClippedBases() {
String ref = "GGGGTT";
ArrayList<Allele> alleles = new ArrayList<Allele>();
alleles.add(Allele.create(ref,true));
alleles.add(Allele.create("GGGGAATT",false));
alleles.add(Allele.create("GGGT",false));
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,5)));
// we know the ordering
//System.err.println(locAndList.second.get(0).toString());
//System.err.println(locAndList.second.get(1).toString());
//System.err.println(locAndList.second.get(2).toString());
Assert.assertTrue(locAndList.second.get(0).toString().equals("GGGT*"));
Assert.assertTrue(locAndList.second.get(0).isReference());
Assert.assertTrue(locAndList.second.get(1).toString().equals("GGGAAT"));
Assert.assertTrue(locAndList.second.get(2).toString().equals("GG"));
}
/**
* test out the clipping of alleles (removing extra context provided by VCF implementation).
*/
@ -322,6 +345,7 @@ public class VCF4UnitTest extends BaseTest {
/**
* test out the clipping of alleles (removing extra context provided by VCF implementation).
* TODO - this is kind of a tricky test... we don't know which way clipped the reads, but the position should be accurate
*/
@Test
public void testClippingOfAllelesLongRefRepeatClippable() {
@ -332,7 +356,7 @@ public class VCF4UnitTest extends BaseTest {
alleles.add(Allele.create("GGG",false));
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",3,5)));
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,4)));
// we know the ordering
Assert.assertTrue(locAndList.second.get(0).toString().equals("GGG*"));