Unit tests for -XL arguments; expt. annotation calculating the GC content within 100 bp of the current SNP
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2997 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e367a50e9b
commit
4dd7c5972c
|
|
@ -158,7 +158,22 @@ public class GenomeAnalysisEngine {
|
|||
// create the output streams
|
||||
initializeOutputStreams(my_walker, microScheduler.getOutputTracker());
|
||||
|
||||
// todo -- call createSetFromList for -XL argument, and unify this with intervals, if provided
|
||||
initializeIntervals();
|
||||
|
||||
ShardStrategy shardStrategy = getShardStrategy(my_walker,
|
||||
microScheduler.getReference(),
|
||||
intervals,
|
||||
argCollection.maximumEngineIterations,
|
||||
readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
|
||||
|
||||
// execute the microscheduler, storing the results
|
||||
return microScheduler.execute(my_walker, shardStrategy, argCollection.maximumEngineIterations);
|
||||
}
|
||||
|
||||
/**
|
||||
* Setup the intervals to be processed
|
||||
*/
|
||||
private void initializeIntervals() {
|
||||
GenomeLocSortedSet excludeIntervals = null;
|
||||
if (argCollection.excludeIntervals != null && argCollection.intervalMerging.check()) {
|
||||
List<GenomeLoc> rawExcludeIntervals = parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL);
|
||||
|
|
@ -171,17 +186,18 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
if ( excludeIntervals != null ) {
|
||||
GenomeLocSortedSet toPrune = intervals == null ? GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) : intervals;
|
||||
intervals = pruneIntervals( toPrune, excludeIntervals );
|
||||
long toPruneSize = toPrune.coveredSize();
|
||||
long toExcludeSize = excludeIntervals.coveredSize();
|
||||
logger.info(String.format("Initial include intervals cover %d bases", toPruneSize));
|
||||
logger.info(String.format("Initial exclude intervals cover %d bases", toExcludeSize));
|
||||
intervals = toPrune.substractRegions( excludeIntervals );
|
||||
long intervalSize = intervals.coveredSize();
|
||||
logger.info(String.format("Excluding %d bases from original intervals (%.2f%% reduction)",
|
||||
toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
|
||||
}
|
||||
|
||||
ShardStrategy shardStrategy = getShardStrategy(my_walker,
|
||||
microScheduler.getReference(),
|
||||
intervals,
|
||||
argCollection.maximumEngineIterations,
|
||||
readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
|
||||
|
||||
// execute the microscheduler, storing the results
|
||||
return microScheduler.execute(my_walker, shardStrategy, argCollection.maximumEngineIterations);
|
||||
if ( intervals != null )
|
||||
logger.info(String.format("Processing %d bases in intervals", intervals.coveredSize()));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -306,16 +322,6 @@ public class GenomeAnalysisEngine {
|
|||
return microScheduler;
|
||||
}
|
||||
|
||||
private GenomeLocSortedSet pruneIntervals( GenomeLocSortedSet toPrune, GenomeLocSortedSet toExclude) {
|
||||
logger.info(String.format("pruning intervals from %d against %d", toPrune.size(), toExclude.size()));
|
||||
//for ( GenomeLoc exclude : toExclude )
|
||||
// toPrune.removeRegion(exclude);
|
||||
GenomeLocSortedSet x = toPrune.substractRegions(toExclude);
|
||||
logger.info(String.format("done pruning intervals == now have %d", x.size()));
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
/**
|
||||
* setup the interval regions, from either the interval file of the genome region string
|
||||
*
|
||||
|
|
|
|||
|
|
@ -0,0 +1,43 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
public class GCContent implements VariantAnnotation {
|
||||
|
||||
public String annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
double content = computeGCContent(ref);
|
||||
return String.format("%.2f", content);
|
||||
}
|
||||
|
||||
public String getKeyName() { return "GC"; }
|
||||
|
||||
public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("GC", 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "GC content within 20 bp +/- the variant"); }
|
||||
|
||||
public boolean useZeroQualityReads() { return false; }
|
||||
|
||||
private static double computeGCContent(ReferenceContext ref) {
|
||||
int gc = 0, at = 0;
|
||||
|
||||
for ( char base : ref.getBases() ) {
|
||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
|
||||
if ( baseIndex == BaseUtils.gIndex || baseIndex == BaseUtils.cIndex )
|
||||
gc++;
|
||||
else if ( baseIndex == BaseUtils.aIndex || baseIndex == BaseUtils.tIndex )
|
||||
at++;
|
||||
else
|
||||
; // ignore
|
||||
}
|
||||
|
||||
int sum = gc + at;
|
||||
return (100.0*gc) / (sum == 0 ? 1 : sum);
|
||||
}
|
||||
}
|
||||
|
|
@ -20,7 +20,7 @@ import java.io.*;
|
|||
*/
|
||||
//@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariationRod.class))
|
||||
@Allows(value={DataSource.READS, DataSource.REFERENCE})
|
||||
@Reference(window=@Window(start=-20,stop=20))
|
||||
@Reference(window=@Window(start=-50,stop=50))
|
||||
@By(DataSource.REFERENCE)
|
||||
public class VariantAnnotator extends LocusWalker<Integer, Integer> {
|
||||
@Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true)
|
||||
|
|
|
|||
|
|
@ -14,6 +14,12 @@ public class BaseUtils {
|
|||
public static final byte DELETION_INDEX = 4;
|
||||
public static final byte NO_CALL_INDEX = 5; // (this is 'N')
|
||||
|
||||
public static int gIndex = BaseUtils.simpleBaseToBaseIndex('G');
|
||||
public static int cIndex = BaseUtils.simpleBaseToBaseIndex('C');
|
||||
public static int aIndex = BaseUtils.simpleBaseToBaseIndex('A');
|
||||
public static int tIndex = BaseUtils.simpleBaseToBaseIndex('T');
|
||||
|
||||
|
||||
/// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
|
||||
// a pyrimidine to another pyrimidine nucleotide (C <-> T).
|
||||
// Approximately two out of every three single nucleotide polymorphisms (SNPs) are transitions.
|
||||
|
|
|
|||
|
|
@ -44,6 +44,18 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
public GenomeLocSortedSet() {
|
||||
}
|
||||
|
||||
public GenomeLocSortedSet(GenomeLoc e) {
|
||||
this();
|
||||
add(e);
|
||||
}
|
||||
|
||||
public GenomeLocSortedSet(Collection<GenomeLoc> l) {
|
||||
this();
|
||||
|
||||
for ( GenomeLoc e : l )
|
||||
add(e);
|
||||
}
|
||||
|
||||
/**
|
||||
* get an iterator over this collection
|
||||
*
|
||||
|
|
@ -62,6 +74,17 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
return mArray.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the size, in bp, of the genomic regions by all of the regions in this set
|
||||
* @return size in bp of the covered regions
|
||||
*/
|
||||
public long coveredSize() {
|
||||
long s = 0;
|
||||
for ( GenomeLoc e : this )
|
||||
s += e.size();
|
||||
return s;
|
||||
}
|
||||
|
||||
/**
|
||||
* determine if the collection is empty
|
||||
*
|
||||
|
|
@ -180,7 +203,6 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
return GenomeLocSortedSet.createSetFromList(good);
|
||||
}
|
||||
|
||||
|
||||
private static final List<GenomeLoc> EMPTY_LIST = new ArrayList<GenomeLoc>();
|
||||
private List<GenomeLoc> subtractRegion(GenomeLoc g, GenomeLoc e) {
|
||||
if (g.equals(e)) {
|
||||
|
|
@ -243,39 +265,6 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
}
|
||||
|
||||
|
||||
// public boolean removeRegions(GenomeLocSortedSet toRemove) {
|
||||
// int i = 0, j = 0;
|
||||
//
|
||||
// while ( true ) {
|
||||
// if ( i >= mArray.size() || j >= toRemove.mArray.size() )
|
||||
// return true;
|
||||
//
|
||||
// GenomeLoc g = mArray.get(i);
|
||||
// GenomeLoc e = toRemove.mArray.get(j);
|
||||
//
|
||||
// if (g.overlapsP(e)) {
|
||||
// boolean finishEarly = removeOverlappingRegion(g, e);
|
||||
// if ( finishEarly ) {
|
||||
// i++;
|
||||
// }
|
||||
// } else if ( g.compareContigs(e) < 0 ) {
|
||||
// i++; // g is on an earlier contig, move g forward
|
||||
// } else if ( g.compareContigs(e) > 0 ) {
|
||||
// j++; // g is on a later contig, move e forward
|
||||
// } else if ( g.getStop() < e.getStart() ) {
|
||||
// i++; // g stops before e starts, move g forward
|
||||
// } else if ( e.getStop() < g.getStart() ) {
|
||||
// j++; // g starts after e stops, move e forward
|
||||
// } else {
|
||||
// throw new StingException("BUG: unexpected condition: g=" + g + ", e=" + e);
|
||||
// }
|
||||
//
|
||||
// if ( (i + j) % 10000 == 0 )
|
||||
// logger.debug("removeRegions operation: i + j = " + ( i + j ));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
|
||||
/**
|
||||
* a simple removal of an interval contained in this list. The interval must be identical to one in the list (no partial locations or overlapping)
|
||||
* @param location the GenomeLoc to remove
|
||||
|
|
@ -373,4 +362,15 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
return this.mArray;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder s = new StringBuilder();
|
||||
s.append("[");
|
||||
for ( GenomeLoc e : this ) {
|
||||
s.append(" ");
|
||||
s.append(e.toString());
|
||||
}
|
||||
s.append("]");
|
||||
|
||||
return s.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import org.junit.Before;
|
|||
import org.junit.Test;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
*
|
||||
|
|
@ -120,10 +121,87 @@ public class GenomeLocSortedSetTest extends BaseTest {
|
|||
assertTrue(loc.getContigIndex() == 1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void deleteAllByRegion() {
|
||||
GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 1, 100);
|
||||
mSortedSet.add(e);
|
||||
for (int x = 1; x < 101; x++) {
|
||||
GenomeLoc del = GenomeLocParser.createGenomeLoc(1,x,x);
|
||||
mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(del));
|
||||
}
|
||||
assertTrue(mSortedSet.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void deleteSomeByRegion() {
|
||||
GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 1, 100);
|
||||
mSortedSet.add(e);
|
||||
for (int x = 1; x < 50; x++) {
|
||||
GenomeLoc del = GenomeLocParser.createGenomeLoc(1,x,x);
|
||||
mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(del));
|
||||
}
|
||||
assertTrue(!mSortedSet.isEmpty());
|
||||
assertTrue(mSortedSet.size() == 1);
|
||||
GenomeLoc loc = mSortedSet.iterator().next();
|
||||
assertTrue(loc.getStop() == 100);
|
||||
assertTrue(loc.getStart() == 50);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void deleteSuperRegion() {
|
||||
GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 10, 20);
|
||||
GenomeLoc g = GenomeLocParser.createGenomeLoc(1, 70, 100);
|
||||
mSortedSet.add(g);
|
||||
mSortedSet.addRegion(e);
|
||||
assertTrue(mSortedSet.size() == 2);
|
||||
// now delete a region
|
||||
GenomeLoc d = GenomeLocParser.createGenomeLoc(1, 15, 75);
|
||||
mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(d));
|
||||
Iterator<GenomeLoc> iter = mSortedSet.iterator();
|
||||
GenomeLoc loc = iter.next();
|
||||
assertTrue(loc.getStart() == 10);
|
||||
assertTrue(loc.getStop() == 14);
|
||||
assertTrue(loc.getContigIndex() == 1);
|
||||
|
||||
loc = iter.next();
|
||||
assertTrue(loc.getStart() == 76);
|
||||
assertTrue(loc.getStop() == 100);
|
||||
assertTrue(loc.getContigIndex() == 1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void substractComplexExample() {
|
||||
GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 1, 20);
|
||||
mSortedSet.add(e);
|
||||
|
||||
GenomeLoc r1 = GenomeLocParser.createGenomeLoc(1, 3, 5);
|
||||
GenomeLoc r2 = GenomeLocParser.createGenomeLoc(1, 10, 12);
|
||||
GenomeLoc r3 = GenomeLocParser.createGenomeLoc(1, 16, 18);
|
||||
GenomeLocSortedSet toExclude = new GenomeLocSortedSet(Arrays.asList(r1, r2, r3));
|
||||
|
||||
GenomeLocSortedSet remaining = mSortedSet.substractRegions(toExclude);
|
||||
// logger.debug("Initial " + mSortedSet);
|
||||
// logger.debug("Exclude " + toExclude);
|
||||
// logger.debug("Remaining " + remaining);
|
||||
|
||||
assertEquals(20, mSortedSet.coveredSize());
|
||||
assertEquals(9, toExclude.coveredSize());
|
||||
assertEquals(11, remaining.coveredSize());
|
||||
|
||||
Iterator<GenomeLoc> it = remaining.iterator();
|
||||
GenomeLoc p1 = it.next();
|
||||
GenomeLoc p2 = it.next();
|
||||
GenomeLoc p3 = it.next();
|
||||
GenomeLoc p4 = it.next();
|
||||
|
||||
assertEquals(GenomeLocParser.createGenomeLoc(1, 1, 2), p1);
|
||||
assertEquals(GenomeLocParser.createGenomeLoc(1, 6, 9), p2);
|
||||
assertEquals(GenomeLocParser.createGenomeLoc(1, 13, 15), p3);
|
||||
assertEquals(GenomeLocParser.createGenomeLoc(1, 19, 20), p4);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
public void fromSequenceDictionary() {
|
||||
|
|
|
|||
Loading…
Reference in New Issue