clip VCF alleles for indels: only a single left base, and as many right bases as align before converting to variant context.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3614 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9872b65803
commit
0cafd3d642
|
|
@ -409,44 +409,29 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
|
static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
|
||||||
List<Allele> newAlleleList = new ArrayList<Allele>();
|
List<Allele> newAlleleList = new ArrayList<Allele>();
|
||||||
|
|
||||||
// find the smallest allele
|
|
||||||
int minimumAllele = (unclippedAlleles.size() > 0) ? unclippedAlleles.get(0).length() : Integer.MAX_VALUE;
|
|
||||||
for (Allele smallest : unclippedAlleles)
|
|
||||||
if (smallest.length() < minimumAllele)
|
|
||||||
minimumAllele = smallest.length();
|
|
||||||
|
|
||||||
|
|
||||||
// find the preceeding string common to all alleles and the reference
|
// find the preceeding string common to all alleles and the reference
|
||||||
int forwardClipped = 0;
|
|
||||||
boolean clipping = true;
|
boolean clipping = true;
|
||||||
while (clipping) {
|
for (Allele a : unclippedAlleles)
|
||||||
for (Allele a : unclippedAlleles)
|
if (a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0])) {
|
||||||
if (forwardClipped > ref.length() - 1)
|
|
||||||
clipping = false;
|
|
||||||
else if (a.length() <= forwardClipped || (a.getBases()[forwardClipped] != ref.getBytes()[forwardClipped])) {
|
|
||||||
clipping = false;
|
clipping = false;
|
||||||
}
|
}
|
||||||
if (clipping) forwardClipped++;
|
int forwardClipping = (clipping) ? 1 : 0;
|
||||||
}
|
|
||||||
|
|
||||||
int reverseClipped = 0;
|
int reverseClipped = 0;
|
||||||
clipping = true;
|
clipping = true;
|
||||||
while (clipping) {
|
while (clipping) {
|
||||||
for (Allele a : unclippedAlleles)
|
for (Allele a : unclippedAlleles)
|
||||||
if (a.length() - reverseClipped < 0 || a.length() - forwardClipped == 0)
|
if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0)
|
||||||
clipping = false;
|
clipping = false;
|
||||||
else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1])
|
else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1])
|
||||||
clipping = false;
|
clipping = false;
|
||||||
if (clipping) reverseClipped++;
|
if (clipping) reverseClipped++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// check to see if we're about to clip all the bases from the reference, if so back off the front clip a base
|
|
||||||
//if (forwardClipped + reverseClipped >= ref.length() || forwardClipped + reverseClipped >= minimumAllele)
|
|
||||||
// forwardClipped--;
|
|
||||||
|
|
||||||
for (Allele a : unclippedAlleles)
|
for (Allele a : unclippedAlleles)
|
||||||
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipped,a.getBases().length-reverseClipped),a.isReference()));
|
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipping,a.getBases().length-reverseClipped),a.isReference()));
|
||||||
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipped,(position+ref.length()-reverseClipped-1)),
|
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipping,(position+ref.length()-reverseClipped-1)),
|
||||||
newAlleleList);
|
newAlleleList);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -199,8 +199,7 @@ public class VCF4UnitTest extends BaseTest {
|
||||||
public void checkLargeVCF() {
|
public void checkLargeVCF() {
|
||||||
TestSetup testSetup = new TestSetup().invoke(largeVCF);
|
TestSetup testSetup = new TestSetup().invoke(largeVCF);
|
||||||
AsciiLineReader reader = testSetup.getReader();
|
AsciiLineReader reader = testSetup.getReader();
|
||||||
VCF4Codec codec = testSetup.getCodec();
|
|
||||||
|
|
||||||
// now parse the lines
|
// now parse the lines
|
||||||
String line = null;
|
String line = null;
|
||||||
try {
|
try {
|
||||||
|
|
@ -299,6 +298,30 @@ public class VCF4UnitTest extends BaseTest {
|
||||||
Assert.assertTrue(locAndList.second.get(2).toString().equals("-"));
|
Assert.assertTrue(locAndList.second.get(2).toString().equals("-"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testClippingManyPotentialFrontClippedBases() {
|
||||||
|
String ref = "GGGGTT";
|
||||||
|
ArrayList<Allele> alleles = new ArrayList<Allele>();
|
||||||
|
alleles.add(Allele.create(ref,true));
|
||||||
|
alleles.add(Allele.create("GGGGAATT",false));
|
||||||
|
alleles.add(Allele.create("GGGT",false));
|
||||||
|
|
||||||
|
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
|
||||||
|
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,5)));
|
||||||
|
|
||||||
|
// we know the ordering
|
||||||
|
//System.err.println(locAndList.second.get(0).toString());
|
||||||
|
//System.err.println(locAndList.second.get(1).toString());
|
||||||
|
//System.err.println(locAndList.second.get(2).toString());
|
||||||
|
Assert.assertTrue(locAndList.second.get(0).toString().equals("GGGT*"));
|
||||||
|
Assert.assertTrue(locAndList.second.get(0).isReference());
|
||||||
|
Assert.assertTrue(locAndList.second.get(1).toString().equals("GGGAAT"));
|
||||||
|
Assert.assertTrue(locAndList.second.get(2).toString().equals("GG"));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||||
*/
|
*/
|
||||||
|
|
@ -322,6 +345,7 @@ public class VCF4UnitTest extends BaseTest {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||||
|
* TODO - this is kind of a tricky test... we don't know which way clipped the reads, but the position should be accurate
|
||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void testClippingOfAllelesLongRefRepeatClippable() {
|
public void testClippingOfAllelesLongRefRepeatClippable() {
|
||||||
|
|
@ -332,7 +356,7 @@ public class VCF4UnitTest extends BaseTest {
|
||||||
alleles.add(Allele.create("GGG",false));
|
alleles.add(Allele.create("GGG",false));
|
||||||
|
|
||||||
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
|
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
|
||||||
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",3,5)));
|
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,4)));
|
||||||
|
|
||||||
// we know the ordering
|
// we know the ordering
|
||||||
Assert.assertTrue(locAndList.second.get(0).toString().equals("GGG*"));
|
Assert.assertTrue(locAndList.second.get(0).toString().equals("GGG*"));
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue