Working implementation of DecodeLoc for VCF parsing. Makes indexing 3x faster.

This commit is contained in:
Eric Banks 2011-08-22 15:11:21 -04:00
parent 518b3dd291
commit 2c24b68a96
1 changed file with 48 additions and 10 deletions

View File

@ -154,9 +154,45 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
* @return a feature, (not guaranteed complete) that has the correct start and stop * @return a feature, (not guaranteed complete) that has the correct start and stop
*/ */
public Feature decodeLoc(String line) { public Feature decodeLoc(String line) {
return reallyDecode(line); String[] locParts = new String[6];
ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
// get our alleles (because the end position depends on them)
String ref = getCachedString(locParts[3].toUpperCase());
String alts = getCachedString(locParts[4].toUpperCase());
List<Allele> alleles = parseAlleles(ref, alts, lineNo);
// find out our location
int start = Integer.valueOf(locParts[1]);
int stop = start;
// ref alleles don't need to be single bases for monomorphic sites
if ( alleles.size() == 1 ) {
stop = start + alleles.get(0).length() - 1;
} else if ( !isSingleNucleotideEvent(alleles) ) {
stop = clipAlleles(start, ref, alleles, null, lineNo);
}
return new VCFLocFeature(locParts[0], start, stop);
} }
private final static class VCFLocFeature implements Feature {
final String chr;
final int start, stop;
private VCFLocFeature(String chr, int start, int stop) {
this.chr = chr;
this.start = start;
this.stop = stop;
}
public String getChr() { return chr; }
public int getStart() { return start; }
public int getEnd() { return stop; }
}
/** /**
* decode the line into a feature (VariantContext) * decode the line into a feature (VariantContext)
* @param line the line * @param line the line
@ -207,7 +243,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
// parse out the required fields // parse out the required fields
String contig = getCachedString(parts[0]); String contig = getCachedString(parts[0]);
long pos = Long.valueOf(parts[1]); int pos = Integer.valueOf(parts[1]);
String id = null; String id = null;
if ( parts[2].length() == 0 ) if ( parts[2].length() == 0 )
generateException("The VCF specification requires a valid ID field"); generateException("The VCF specification requires a valid ID field");
@ -227,7 +263,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
Map<String, Object> attributes = parseInfo(info, id); Map<String, Object> attributes = parseInfo(info, id);
// find out our current location, and clip the alleles down to their minimum length // find out our current location, and clip the alleles down to their minimum length
long loc = pos; int loc = pos;
// ref alleles don't need to be single bases for monomorphic sites // ref alleles don't need to be single bases for monomorphic sites
if ( alleles.size() == 1 ) { if ( alleles.size() == 1 ) {
loc = pos + alleles.get(0).length() - 1; loc = pos + alleles.get(0).length() - 1;
@ -506,9 +542,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
* @param ref the reference string * @param ref the reference string
* @param unclippedAlleles the list of unclipped alleles * @param unclippedAlleles the list of unclipped alleles
* @param clippedAlleles output list of clipped alleles * @param clippedAlleles output list of clipped alleles
* @return a list of alleles, clipped to the reference * @return the new reference end position of this event
*/ */
protected static long clipAlleles(long position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) { protected static int clipAlleles(int position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
// Note that the computation of forward clipping here is meant only to see whether there is a common // Note that the computation of forward clipping here is meant only to see whether there is a common
// base to all alleles, and to correctly compute reverse clipping, // base to all alleles, and to correctly compute reverse clipping,
@ -534,11 +570,13 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
if (clipping) reverseClipped++; if (clipping) reverseClipped++;
} }
for (Allele a : unclippedAlleles) { if ( clippedAlleles != null ) {
if (a.isSymbolic()) { for ( Allele a : unclippedAlleles ) {
clippedAlleles.add(a); if ( a.isSymbolic() ) {
} else { clippedAlleles.add(a);
clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(),0,a.getBases().length-reverseClipped),a.isReference())); } else {
clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(),0,a.getBases().length-reverseClipped),a.isReference()));
}
} }
} }