clip to the null allele on the reference string in VCF 4, instead of stopping to perserve one reference base.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3613 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-06-22 20:52:19 +00:00
parent b5df2705c9
commit 9872b65803
2 changed files with 75 additions and 5 deletions

View File

@ -409,6 +409,13 @@ public class VCF4Codec implements FeatureCodec {
static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
List<Allele> newAlleleList = new ArrayList<Allele>();
// find the smallest allele
int minimumAllele = (unclippedAlleles.size() > 0) ? unclippedAlleles.get(0).length() : Integer.MAX_VALUE;
for (Allele smallest : unclippedAlleles)
if (smallest.length() < minimumAllele)
minimumAllele = smallest.length();
// find the preceeding string common to all alleles and the reference
int forwardClipped = 0;
boolean clipping = true;
@ -434,8 +441,8 @@ public class VCF4Codec implements FeatureCodec {
}
// check to see if we're about to clip all the bases from the reference, if so back off the front clip a base
if (forwardClipped + reverseClipped >= ref.length())
forwardClipped--;
//if (forwardClipped + reverseClipped >= ref.length() || forwardClipped + reverseClipped >= minimumAllele)
// forwardClipped--;
for (Allele a : unclippedAlleles)
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipped,a.getBases().length-reverseClipped),a.isReference()));

View File

@ -193,9 +193,9 @@ public class VCF4UnitTest extends BaseTest {
testSetup.codec.decode(twoManyInfoLine);
}
/* File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
@Test
//@Test
public void checkLargeVCF() {
TestSetup testSetup = new TestSetup().invoke(largeVCF);
AsciiLineReader reader = testSetup.getReader();
@ -212,6 +212,7 @@ public class VCF4UnitTest extends BaseTest {
// our record count
int recordCount = 0;
int badRecordCount = 0;
long milliseconds = System.currentTimeMillis();
while (line != null) {
try {
recordCount++;
@ -229,10 +230,51 @@ public class VCF4UnitTest extends BaseTest {
Assert.fail("Failed to read a line");
}
}
System.err.println("Total time = " + (System.currentTimeMillis() - milliseconds));
Assert.assertEquals(0,badRecordCount);
Assert.assertEquals(728075,recordCount);
}
*/
//@Test
public void checkBobsCNVVCF() {
TestSetup testSetup = new TestSetup().invoke(new File("bobs.vcf"));
AsciiLineReader reader = testSetup.getReader();
VCF4Codec codec = testSetup.getCodec();
// now parse the lines
String line = null;
try {
line = reader.readLine();
} catch (IOException e) {
Assert.fail("Failed to read a line");
}
// our record count
int recordCount = 0;
int badRecordCount = 0;
long milliseconds = System.currentTimeMillis();
while (line != null) {
try {
recordCount++;
try {
testSetup.codec.decode(line);
} catch (Exception e) {
System.err.println(e.getMessage() + " -> " + line);
System.err.println(line);
badRecordCount++;
}
line = reader.readLine();
if (recordCount % 1000 == 0)
System.err.println("record count == " + recordCount);
} catch (IOException e) {
Assert.fail("Failed to read a line");
}
}
System.err.println("Total time = " + (System.currentTimeMillis() - milliseconds));
Assert.assertEquals(0,badRecordCount);
Assert.assertEquals(15947,recordCount);
}
/**
* test out the clipping of alleles (removing extra context provided by VCF implementation).
*/
@ -278,6 +320,27 @@ public class VCF4UnitTest extends BaseTest {
Assert.assertTrue(locAndList.second.get(2).toString().equals("G"));
}
/**
* test out the clipping of alleles (removing extra context provided by VCF implementation).
*/
@Test
public void testClippingOfAllelesLongRefRepeatClippable() {
String ref = "GGGGG";
ArrayList<Allele> alleles = new ArrayList<Allele>();
alleles.add(Allele.create(ref,true));
alleles.add(Allele.create("GG",false));
alleles.add(Allele.create("GGG",false));
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",3,5)));
// we know the ordering
Assert.assertTrue(locAndList.second.get(0).toString().equals("GGG*"));
Assert.assertTrue(locAndList.second.get(0).isReference());
Assert.assertTrue(locAndList.second.get(1).toString().equals("-"));
Assert.assertTrue(locAndList.second.get(2).toString().equals("G"));
}
/**
* test out the clipping of alleles (removing extra context provided by VCF implementation).
*/