Unit tests for -XL arguments; expt. annotation calculating the GC content within 100 bp of the current SNP

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2997 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-03-14 21:08:14 +00:00
parent e367a50e9b
commit 4dd7c5972c
6 changed files with 189 additions and 56 deletions

View File

@ -158,7 +158,22 @@ public class GenomeAnalysisEngine {
// create the output streams
initializeOutputStreams(my_walker, microScheduler.getOutputTracker());
// todo -- call createSetFromList for -XL argument, and unify this with intervals, if provided
initializeIntervals();
ShardStrategy shardStrategy = getShardStrategy(my_walker,
microScheduler.getReference(),
intervals,
argCollection.maximumEngineIterations,
readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
// execute the microscheduler, storing the results
return microScheduler.execute(my_walker, shardStrategy, argCollection.maximumEngineIterations);
}
/**
* Setup the intervals to be processed
*/
private void initializeIntervals() {
GenomeLocSortedSet excludeIntervals = null;
if (argCollection.excludeIntervals != null && argCollection.intervalMerging.check()) {
List<GenomeLoc> rawExcludeIntervals = parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL);
@ -171,17 +186,18 @@ public class GenomeAnalysisEngine {
if ( excludeIntervals != null ) {
GenomeLocSortedSet toPrune = intervals == null ? GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) : intervals;
intervals = pruneIntervals( toPrune, excludeIntervals );
long toPruneSize = toPrune.coveredSize();
long toExcludeSize = excludeIntervals.coveredSize();
logger.info(String.format("Initial include intervals cover %d bases", toPruneSize));
logger.info(String.format("Initial exclude intervals cover %d bases", toExcludeSize));
intervals = toPrune.substractRegions( excludeIntervals );
long intervalSize = intervals.coveredSize();
logger.info(String.format("Excluding %d bases from original intervals (%.2f%% reduction)",
toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
}
ShardStrategy shardStrategy = getShardStrategy(my_walker,
microScheduler.getReference(),
intervals,
argCollection.maximumEngineIterations,
readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
// execute the microscheduler, storing the results
return microScheduler.execute(my_walker, shardStrategy, argCollection.maximumEngineIterations);
if ( intervals != null )
logger.info(String.format("Processing %d bases in intervals", intervals.coveredSize()));
}
/**
@ -306,16 +322,6 @@ public class GenomeAnalysisEngine {
return microScheduler;
}
private GenomeLocSortedSet pruneIntervals( GenomeLocSortedSet toPrune, GenomeLocSortedSet toExclude) {
logger.info(String.format("pruning intervals from %d against %d", toPrune.size(), toExclude.size()));
//for ( GenomeLoc exclude : toExclude )
// toPrune.removeRegion(exclude);
GenomeLocSortedSet x = toPrune.substractRegions(toExclude);
logger.info(String.format("done pruning intervals == now have %d", x.size()));
return x;
}
/**
* setup the interval regions, from either the interval file of the genome region string
*

View File

@ -0,0 +1,43 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import java.util.Map;
public class GCContent implements VariantAnnotation {
public String annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
double content = computeGCContent(ref);
return String.format("%.2f", content);
}
public String getKeyName() { return "GC"; }
public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("GC", 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "GC content within 20 bp +/- the variant"); }
public boolean useZeroQualityReads() { return false; }
private static double computeGCContent(ReferenceContext ref) {
int gc = 0, at = 0;
for ( char base : ref.getBases() ) {
int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
if ( baseIndex == BaseUtils.gIndex || baseIndex == BaseUtils.cIndex )
gc++;
else if ( baseIndex == BaseUtils.aIndex || baseIndex == BaseUtils.tIndex )
at++;
else
; // ignore
}
int sum = gc + at;
return (100.0*gc) / (sum == 0 ? 1 : sum);
}
}

View File

@ -20,7 +20,7 @@ import java.io.*;
*/
//@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariationRod.class))
@Allows(value={DataSource.READS, DataSource.REFERENCE})
@Reference(window=@Window(start=-20,stop=20))
@Reference(window=@Window(start=-50,stop=50))
@By(DataSource.REFERENCE)
public class VariantAnnotator extends LocusWalker<Integer, Integer> {
@Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true)

View File

@ -14,6 +14,12 @@ public class BaseUtils {
public static final byte DELETION_INDEX = 4;
public static final byte NO_CALL_INDEX = 5; // (this is 'N')
public static int gIndex = BaseUtils.simpleBaseToBaseIndex('G');
public static int cIndex = BaseUtils.simpleBaseToBaseIndex('C');
public static int aIndex = BaseUtils.simpleBaseToBaseIndex('A');
public static int tIndex = BaseUtils.simpleBaseToBaseIndex('T');
/// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
// a pyrimidine to another pyrimidine nucleotide (C <-> T).
// Approximately two out of every three single nucleotide polymorphisms (SNPs) are transitions.

View File

@ -44,6 +44,18 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
public GenomeLocSortedSet() {
}
public GenomeLocSortedSet(GenomeLoc e) {
this();
add(e);
}
public GenomeLocSortedSet(Collection<GenomeLoc> l) {
this();
for ( GenomeLoc e : l )
add(e);
}
/**
* get an iterator over this collection
*
@ -62,6 +74,17 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
return mArray.size();
}
/**
* Return the size, in bp, of the genomic regions by all of the regions in this set
* @return size in bp of the covered regions
*/
public long coveredSize() {
long s = 0;
for ( GenomeLoc e : this )
s += e.size();
return s;
}
/**
* determine if the collection is empty
*
@ -180,7 +203,6 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
return GenomeLocSortedSet.createSetFromList(good);
}
private static final List<GenomeLoc> EMPTY_LIST = new ArrayList<GenomeLoc>();
private List<GenomeLoc> subtractRegion(GenomeLoc g, GenomeLoc e) {
if (g.equals(e)) {
@ -243,39 +265,6 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
}
// public boolean removeRegions(GenomeLocSortedSet toRemove) {
// int i = 0, j = 0;
//
// while ( true ) {
// if ( i >= mArray.size() || j >= toRemove.mArray.size() )
// return true;
//
// GenomeLoc g = mArray.get(i);
// GenomeLoc e = toRemove.mArray.get(j);
//
// if (g.overlapsP(e)) {
// boolean finishEarly = removeOverlappingRegion(g, e);
// if ( finishEarly ) {
// i++;
// }
// } else if ( g.compareContigs(e) < 0 ) {
// i++; // g is on an earlier contig, move g forward
// } else if ( g.compareContigs(e) > 0 ) {
// j++; // g is on a later contig, move e forward
// } else if ( g.getStop() < e.getStart() ) {
// i++; // g stops before e starts, move g forward
// } else if ( e.getStop() < g.getStart() ) {
// j++; // g starts after e stops, move e forward
// } else {
// throw new StingException("BUG: unexpected condition: g=" + g + ", e=" + e);
// }
//
// if ( (i + j) % 10000 == 0 )
// logger.debug("removeRegions operation: i + j = " + ( i + j ));
// }
// }
//
/**
* a simple removal of an interval contained in this list. The interval must be identical to one in the list (no partial locations or overlapping)
* @param location the GenomeLoc to remove
@ -373,4 +362,15 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
return this.mArray;
}
public String toString() {
StringBuilder s = new StringBuilder();
s.append("[");
for ( GenomeLoc e : this ) {
s.append(" ");
s.append(e.toString());
}
s.append("]");
return s.toString();
}
}

View File

@ -9,6 +9,7 @@ import org.junit.Before;
import org.junit.Test;
import java.util.Iterator;
import java.util.Arrays;
/**
*
@ -120,10 +121,87 @@ public class GenomeLocSortedSetTest extends BaseTest {
assertTrue(loc.getContigIndex() == 1);
}
@Test
public void deleteAllByRegion() {
GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 1, 100);
mSortedSet.add(e);
for (int x = 1; x < 101; x++) {
GenomeLoc del = GenomeLocParser.createGenomeLoc(1,x,x);
mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(del));
}
assertTrue(mSortedSet.isEmpty());
}
@Test
public void deleteSomeByRegion() {
GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 1, 100);
mSortedSet.add(e);
for (int x = 1; x < 50; x++) {
GenomeLoc del = GenomeLocParser.createGenomeLoc(1,x,x);
mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(del));
}
assertTrue(!mSortedSet.isEmpty());
assertTrue(mSortedSet.size() == 1);
GenomeLoc loc = mSortedSet.iterator().next();
assertTrue(loc.getStop() == 100);
assertTrue(loc.getStart() == 50);
}
@Test
public void deleteSuperRegion() {
GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 10, 20);
GenomeLoc g = GenomeLocParser.createGenomeLoc(1, 70, 100);
mSortedSet.add(g);
mSortedSet.addRegion(e);
assertTrue(mSortedSet.size() == 2);
// now delete a region
GenomeLoc d = GenomeLocParser.createGenomeLoc(1, 15, 75);
mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(d));
Iterator<GenomeLoc> iter = mSortedSet.iterator();
GenomeLoc loc = iter.next();
assertTrue(loc.getStart() == 10);
assertTrue(loc.getStop() == 14);
assertTrue(loc.getContigIndex() == 1);
loc = iter.next();
assertTrue(loc.getStart() == 76);
assertTrue(loc.getStop() == 100);
assertTrue(loc.getContigIndex() == 1);
}
@Test
public void substractComplexExample() {
GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 1, 20);
mSortedSet.add(e);
GenomeLoc r1 = GenomeLocParser.createGenomeLoc(1, 3, 5);
GenomeLoc r2 = GenomeLocParser.createGenomeLoc(1, 10, 12);
GenomeLoc r3 = GenomeLocParser.createGenomeLoc(1, 16, 18);
GenomeLocSortedSet toExclude = new GenomeLocSortedSet(Arrays.asList(r1, r2, r3));
GenomeLocSortedSet remaining = mSortedSet.substractRegions(toExclude);
// logger.debug("Initial " + mSortedSet);
// logger.debug("Exclude " + toExclude);
// logger.debug("Remaining " + remaining);
assertEquals(20, mSortedSet.coveredSize());
assertEquals(9, toExclude.coveredSize());
assertEquals(11, remaining.coveredSize());
Iterator<GenomeLoc> it = remaining.iterator();
GenomeLoc p1 = it.next();
GenomeLoc p2 = it.next();
GenomeLoc p3 = it.next();
GenomeLoc p4 = it.next();
assertEquals(GenomeLocParser.createGenomeLoc(1, 1, 2), p1);
assertEquals(GenomeLocParser.createGenomeLoc(1, 6, 9), p2);
assertEquals(GenomeLocParser.createGenomeLoc(1, 13, 15), p3);
assertEquals(GenomeLocParser.createGenomeLoc(1, 19, 20), p4);
}
@Test
public void fromSequenceDictionary() {