clip VCF alleles for indels: only a single left base, and as many right bases as align before converting to variant context.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3614 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9872b65803
commit
0cafd3d642
|
|
@ -409,44 +409,29 @@ public class VCF4Codec implements FeatureCodec {
|
|||
static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
|
||||
List<Allele> newAlleleList = new ArrayList<Allele>();
|
||||
|
||||
// find the smallest allele
|
||||
int minimumAllele = (unclippedAlleles.size() > 0) ? unclippedAlleles.get(0).length() : Integer.MAX_VALUE;
|
||||
for (Allele smallest : unclippedAlleles)
|
||||
if (smallest.length() < minimumAllele)
|
||||
minimumAllele = smallest.length();
|
||||
|
||||
|
||||
// find the preceeding string common to all alleles and the reference
|
||||
int forwardClipped = 0;
|
||||
boolean clipping = true;
|
||||
while (clipping) {
|
||||
for (Allele a : unclippedAlleles)
|
||||
if (forwardClipped > ref.length() - 1)
|
||||
clipping = false;
|
||||
else if (a.length() <= forwardClipped || (a.getBases()[forwardClipped] != ref.getBytes()[forwardClipped])) {
|
||||
for (Allele a : unclippedAlleles)
|
||||
if (a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0])) {
|
||||
clipping = false;
|
||||
}
|
||||
if (clipping) forwardClipped++;
|
||||
}
|
||||
int forwardClipping = (clipping) ? 1 : 0;
|
||||
|
||||
int reverseClipped = 0;
|
||||
clipping = true;
|
||||
while (clipping) {
|
||||
for (Allele a : unclippedAlleles)
|
||||
if (a.length() - reverseClipped < 0 || a.length() - forwardClipped == 0)
|
||||
if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0)
|
||||
clipping = false;
|
||||
else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1])
|
||||
clipping = false;
|
||||
if (clipping) reverseClipped++;
|
||||
}
|
||||
|
||||
// check to see if we're about to clip all the bases from the reference, if so back off the front clip a base
|
||||
//if (forwardClipped + reverseClipped >= ref.length() || forwardClipped + reverseClipped >= minimumAllele)
|
||||
// forwardClipped--;
|
||||
|
||||
|
||||
for (Allele a : unclippedAlleles)
|
||||
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipped,a.getBases().length-reverseClipped),a.isReference()));
|
||||
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipped,(position+ref.length()-reverseClipped-1)),
|
||||
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipping,a.getBases().length-reverseClipped),a.isReference()));
|
||||
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipping,(position+ref.length()-reverseClipped-1)),
|
||||
newAlleleList);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -199,8 +199,7 @@ public class VCF4UnitTest extends BaseTest {
|
|||
public void checkLargeVCF() {
|
||||
TestSetup testSetup = new TestSetup().invoke(largeVCF);
|
||||
AsciiLineReader reader = testSetup.getReader();
|
||||
VCF4Codec codec = testSetup.getCodec();
|
||||
|
||||
|
||||
// now parse the lines
|
||||
String line = null;
|
||||
try {
|
||||
|
|
@ -299,6 +298,30 @@ public class VCF4UnitTest extends BaseTest {
|
|||
Assert.assertTrue(locAndList.second.get(2).toString().equals("-"));
|
||||
}
|
||||
|
||||
/**
|
||||
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||
*/
|
||||
@Test
|
||||
public void testClippingManyPotentialFrontClippedBases() {
|
||||
String ref = "GGGGTT";
|
||||
ArrayList<Allele> alleles = new ArrayList<Allele>();
|
||||
alleles.add(Allele.create(ref,true));
|
||||
alleles.add(Allele.create("GGGGAATT",false));
|
||||
alleles.add(Allele.create("GGGT",false));
|
||||
|
||||
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
|
||||
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,5)));
|
||||
|
||||
// we know the ordering
|
||||
//System.err.println(locAndList.second.get(0).toString());
|
||||
//System.err.println(locAndList.second.get(1).toString());
|
||||
//System.err.println(locAndList.second.get(2).toString());
|
||||
Assert.assertTrue(locAndList.second.get(0).toString().equals("GGGT*"));
|
||||
Assert.assertTrue(locAndList.second.get(0).isReference());
|
||||
Assert.assertTrue(locAndList.second.get(1).toString().equals("GGGAAT"));
|
||||
Assert.assertTrue(locAndList.second.get(2).toString().equals("GG"));
|
||||
}
|
||||
|
||||
/**
|
||||
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||
*/
|
||||
|
|
@ -322,6 +345,7 @@ public class VCF4UnitTest extends BaseTest {
|
|||
|
||||
/**
|
||||
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||
* TODO - this is kind of a tricky test... we don't know which way clipped the reads, but the position should be accurate
|
||||
*/
|
||||
@Test
|
||||
public void testClippingOfAllelesLongRefRepeatClippable() {
|
||||
|
|
@ -332,7 +356,7 @@ public class VCF4UnitTest extends BaseTest {
|
|||
alleles.add(Allele.create("GGG",false));
|
||||
|
||||
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
|
||||
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",3,5)));
|
||||
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,4)));
|
||||
|
||||
// we know the ordering
|
||||
Assert.assertTrue(locAndList.second.get(0).toString().equals("GGG*"));
|
||||
|
|
|
|||
Loading…
Reference in New Issue