Reorganized GenomeLoc code to more clearly and better use the picard SequenceDictionary information.
All GenomeLoc[] are not ArrayList<GenomeLoc> for clarity and consistency Parsing now recursively merges contiguous elements chr1:1-10;chr1:11-20 => chr1:1-20 Added support for TraversingByLoci over all reference positions specified by the provided location array. System dynamically determines which traversal system to use. Pileup now marks, very clearly, reference positions without covered reads. Made changes around the codebase to deal with new GenomeLoc structure. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@218 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c2ae6765a3
commit
d7c0bcc223
|
|
@ -16,6 +16,7 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
|
|||
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraverseByReads;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraverseByLoci;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraverseByLociByReference;
|
||||
import org.broadinstitute.sting.utils.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
|
@ -187,7 +188,10 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
// Try to get the walker specified
|
||||
try {
|
||||
LocusWalker<?, ?> walker = (LocusWalker<?, ?>) my_walker;
|
||||
this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods);
|
||||
if ( WALK_ALL_LOCI )
|
||||
this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods);
|
||||
else
|
||||
this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods);
|
||||
}
|
||||
catch (java.lang.ClassCastException e) {
|
||||
// I guess we're a read walker LOL
|
||||
|
|
@ -198,16 +202,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
// Prepare the sort ordering w.r.t. the sequence dictionary
|
||||
if (REF_FILE_ARG != null) {
|
||||
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
|
||||
List<SAMSequenceRecord> refContigs = refFile.getSequenceDictionary().getSequences();
|
||||
|
||||
HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
|
||||
int i = 0;
|
||||
for (SAMSequenceRecord contig : refContigs) {
|
||||
refContigOrdering.put(contig.getSequenceName(), i);
|
||||
i++;
|
||||
}
|
||||
|
||||
GenomeLoc.setContigOrdering(refContigOrdering);
|
||||
GenomeLoc.setupRefContigOrdering(refFile);
|
||||
}
|
||||
ValidationStringency strictness;
|
||||
if (STRICTNESS_ARG == null) {
|
||||
|
|
@ -232,6 +227,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
if (INTERVALS_FILE != null) {
|
||||
engine.setLocationFromFile(INTERVALS_FILE);
|
||||
}
|
||||
|
||||
engine.setSafetyChecking(!UNSAFE);
|
||||
engine.setSortOnFly(ENABLED_SORT_ON_FLY);
|
||||
engine.setThreadedIO(ENABLED_THREADED_IO);
|
||||
|
|
@ -277,7 +273,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
*/
|
||||
private static void testNewReferenceFeatures(final File refFileName) {
|
||||
final FastaSequenceFile2 refFile = new FastaSequenceFile2(refFileName);
|
||||
Utils.setupRefContigOrdering(refFile);
|
||||
GenomeLoc.setupRefContigOrdering(refFile);
|
||||
|
||||
List<SAMSequenceRecord> refContigs = refFile.getSequenceDictionary().getSequences();
|
||||
|
||||
|
|
|
|||
|
|
@ -24,14 +24,14 @@ public class SamQueryIterator implements Iterator<SAMRecord> {
|
|||
Iterator<GenomeLoc> locIter = null;
|
||||
CloseableIterator<SAMRecord> recordIter = null;
|
||||
|
||||
public SamQueryIterator( SAMFileReader reader, GenomeLoc[] locs ) {
|
||||
public SamQueryIterator( SAMFileReader reader, ArrayList<GenomeLoc> locs ) {
|
||||
this.reader = reader;
|
||||
|
||||
// Our internal contract for the class guarantees that locIter and recordIter are never null.
|
||||
// Initialize them and seed them with empty data as necessary.
|
||||
if(locs != null) {
|
||||
// The user requested a specific set of locations, set up the iterators accordly.
|
||||
locIter = Arrays.asList(locs).iterator();
|
||||
locIter = locs.iterator();
|
||||
recordIter = new NullCloseableIterator<SAMRecord>();
|
||||
}
|
||||
else {
|
||||
|
|
|
|||
|
|
@ -93,7 +93,7 @@ public abstract class TraversalEngine {
|
|||
|
||||
|
||||
// Locations we are going to process during the traversal
|
||||
protected GenomeLoc[] locs = null;
|
||||
private ArrayList<GenomeLoc> locs = null;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -175,7 +175,7 @@ public abstract class TraversalEngine {
|
|||
* @param locStr
|
||||
*/
|
||||
public void setLocation(final String locStr) {
|
||||
this.locs = parseGenomeLocs(locStr);
|
||||
this.locs = GenomeLoc.parseGenomeLocs(locStr);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -194,77 +194,19 @@ public abstract class TraversalEngine {
|
|||
reader.close();
|
||||
String locStr = Utils.join(";", lines);
|
||||
logger.debug("locStr: " + locStr);
|
||||
this.locs = parseGenomeLocs(locStr);
|
||||
setLocation(locStr);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
System.exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Useful utility function that parses a location string into a coordinate-order sorted
|
||||
* array of GenomeLoc objects
|
||||
*
|
||||
* @param str
|
||||
* @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order
|
||||
*/
|
||||
public static GenomeLoc[] parseGenomeLocs(final String str) {
|
||||
// Of the form: loc1;loc2;...
|
||||
// Where each locN can be:
|
||||
// Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
|
||||
StdReflect reflect = new JdkStdReflect();
|
||||
FunctionN<GenomeLoc> parseOne = reflect.staticFunction(GenomeLoc.class, "parseGenomeLoc", String.class);
|
||||
Function1<GenomeLoc, String> f1 = parseOne.f1();
|
||||
try {
|
||||
Collection<GenomeLoc> result = Functions.map(f1, Arrays.asList(str.split(";")));
|
||||
GenomeLoc[] locs = (GenomeLoc[]) result.toArray(new GenomeLoc[0]);
|
||||
Arrays.sort(locs);
|
||||
logger.info(String.format("Going to process %d locations", locs.length));
|
||||
//System.out.println(" Locations are: " + Utils.join("\n", Functions.map(Operators.toString, Arrays.asList(locs))));
|
||||
return locs;
|
||||
} catch (Exception e) {
|
||||
logger.fatal(String.format("Invalid locations string: %s, format is loc1;loc2; where each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'", str));
|
||||
throw new IllegalArgumentException("Invalid locations string: " + str + ", format is loc1;loc2; where each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A key function that returns true if the proposed GenomeLoc curr is within the list of
|
||||
* locations we are processing in this TraversalEngine
|
||||
*
|
||||
* @param curr
|
||||
* @return true if we should process GenomeLoc curr, otherwise false
|
||||
*/
|
||||
public boolean inLocations(GenomeLoc curr) {
|
||||
if (this.locs == null) {
|
||||
return true;
|
||||
} else {
|
||||
for ( GenomeLoc loc : this.locs ) {
|
||||
//System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr));
|
||||
if (loc.overlapsP(curr))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasLocations() {
|
||||
return this.locs != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true iff we have a specified series of locations to process AND we are past the last
|
||||
* location in the list. It means that, in a serial processing of the genome, that we are done.
|
||||
*
|
||||
* @param curr Current genome Location
|
||||
* @return true if we are past the last location to process
|
||||
*/
|
||||
protected boolean pastFinalLocation(GenomeLoc curr) {
|
||||
boolean r = locs != null && locs[locs.length - 1].compareTo(curr) == -1 && !locs[locs.length - 1].overlapsP(curr);
|
||||
//System.out.printf(" pastFinalLocation %s vs. %s => %d => %b%n", locs[locs.length-1], curr, locs[locs.length-1].compareTo( curr ), r);
|
||||
return r;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// printing
|
||||
|
|
@ -433,11 +375,11 @@ public abstract class TraversalEngine {
|
|||
//this.refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFileName);
|
||||
this.refFile = new FastaSequenceFile2(refFileName); // todo: replace when FastaSequenceFile2 is in picard
|
||||
this.refIter = new ReferenceIterator(this.refFile);
|
||||
if (!Utils.setupRefContigOrdering(this.refFile)) {
|
||||
// We couldn't process the reference contig ordering, fail since we need it
|
||||
logger.fatal(String.format("We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.", refFileName));
|
||||
throw new RuntimeException("We couldn't load the contig dictionary associated with " + refFileName + ". At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.");
|
||||
}
|
||||
// if (!GenomeLoc.setupRefContigOrdering(this.refFile)) {
|
||||
// // We couldn't process the reference contig ordering, fail since we need it
|
||||
// logger.fatal(String.format("We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.", refFileName));
|
||||
// throw new RuntimeException("We couldn't load the contig dictionary associated with " + refFileName + ". At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.");
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -510,14 +452,14 @@ public abstract class TraversalEngine {
|
|||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public <M, T> T traverse(Walker<M, T> walker) {
|
||||
List<GenomeLoc> l = new ArrayList<GenomeLoc>();
|
||||
ArrayList<GenomeLoc> l = new ArrayList<GenomeLoc>();
|
||||
if ( hasLocations() )
|
||||
l = Arrays.asList(locs);
|
||||
l = locs;
|
||||
|
||||
return traverse(walker, l);
|
||||
}
|
||||
|
||||
public <M, T> T traverse(Walker<M, T> walker, List<GenomeLoc> locations) {
|
||||
public <M, T> T traverse(Walker<M, T> walker, ArrayList<GenomeLoc> locations) {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ import org.broadinstitute.sting.utils.Utils;
|
|||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.ArrayList;
|
||||
import java.io.File;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
|
@ -34,7 +35,7 @@ public class TraverseByLoci extends TraversalEngine {
|
|||
super(reads, ref, rods);
|
||||
}
|
||||
|
||||
public <M,T> T traverse(Walker<M,T> walker, List<GenomeLoc> locations) {
|
||||
public <M,T> T traverse(Walker<M,T> walker, ArrayList<GenomeLoc> locations) {
|
||||
if ( walker instanceof LocusWalker ) {
|
||||
Walker x = walker;
|
||||
LocusWalker<?, ?> locusWalker = (LocusWalker<?, ?>)x;
|
||||
|
|
@ -54,7 +55,9 @@ public class TraverseByLoci extends TraversalEngine {
|
|||
* @param <T> ReduceType -- the result of calling reduce() on the walker
|
||||
* @return 0 on success
|
||||
*/
|
||||
protected <M, T> T traverseByLoci(LocusWalker<M, T> walker, List<GenomeLoc> locations) {
|
||||
protected <M, T> T traverseByLoci(LocusWalker<M, T> walker, ArrayList<GenomeLoc> locations) {
|
||||
logger.debug("Entering traverseByLoci");
|
||||
|
||||
samReader = initializeSAMFile(readsFile);
|
||||
|
||||
verifySortOrder(true);
|
||||
|
|
@ -63,9 +66,14 @@ public class TraverseByLoci extends TraversalEngine {
|
|||
walker.initialize();
|
||||
|
||||
T sum = walker.reduceInit();
|
||||
if ( samReader.hasIndex() && hasLocations() ) {
|
||||
if ( ! locations.isEmpty() ) {
|
||||
logger.debug("Doing interval-based traversal");
|
||||
|
||||
if ( ! samReader.hasIndex() )
|
||||
Utils.scareUser("Processing locations were requested, but no index was found for the input SAM/BAM file. This operation is potentially dangerously slow, aborting.");
|
||||
|
||||
// we are doing interval-based traversals
|
||||
for ( GenomeLoc interval : locs ) {
|
||||
for ( GenomeLoc interval : locations ) {
|
||||
logger.debug(String.format("Processing locus %s", interval.toString()));
|
||||
|
||||
CloseableIterator<SAMRecord> readIter = samReader.queryOverlapping( interval.getContig(),
|
||||
|
|
@ -79,6 +87,7 @@ public class TraverseByLoci extends TraversalEngine {
|
|||
}
|
||||
else {
|
||||
// We aren't locus oriented
|
||||
logger.debug("Doing non-interval-based traversal");
|
||||
samReadIter = WrapReadsIterator(getReadsIterator(samReader), true);
|
||||
sum = carryWalkerOverInterval(walker, samReadIter, sum, null);
|
||||
}
|
||||
|
|
@ -89,6 +98,8 @@ public class TraverseByLoci extends TraversalEngine {
|
|||
}
|
||||
|
||||
protected <M, T> T carryWalkerOverInterval( LocusWalker<M, T> walker, Iterator<SAMRecord> readIter, T sum, GenomeLoc interval ) {
|
||||
logger.debug(String.format("TraverseByLoci.carryWalkerOverInterval Genomic interval is %s", interval));
|
||||
|
||||
// prepare the read filtering read iterator and provide it to a new locus iterator
|
||||
FilteringIterator filterIter = new FilteringIterator(readIter, new locusStreamFilterFunc());
|
||||
|
||||
|
|
@ -102,38 +113,49 @@ public class TraverseByLoci extends TraversalEngine {
|
|||
|
||||
// if we don't have a particular interval we're processing, check them all, otherwise only operate at this
|
||||
// location
|
||||
if ( ( interval == null && inLocations(locus.getLocation()) ) || (interval != null && interval.overlapsP(locus.getLocation())) ) {
|
||||
|
||||
//System.out.format("Working at %s\n", locus.getLocation().toString());
|
||||
|
||||
if ( interval == null || interval.overlapsP(locus.getLocation()) ) {
|
||||
ReferenceIterator refSite = refIter.seekForward(locus.getLocation());
|
||||
final char refBase = refSite.getBaseAsChar();
|
||||
locus.setReferenceContig(refSite.getCurrentContig());
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this locus
|
||||
final List<ReferenceOrderedDatum> rodData = getReferenceOrderedDataAtLocus(rodIters, locus.getLocation());
|
||||
|
||||
logger.debug(String.format(" Reference: %s:%d %c", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase));
|
||||
walkAtLocus( walker, sum, locus, refSite, rodData );
|
||||
|
||||
//
|
||||
// Execute our contract with the walker. Call filter, map, and reduce
|
||||
//
|
||||
final boolean keepMeP = walker.filter(rodData, refBase, locus);
|
||||
if (keepMeP) {
|
||||
M x = walker.map(rodData, refBase, locus);
|
||||
sum = walker.reduce(x, sum);
|
||||
}
|
||||
//System.out.format("Working at %s\n", locus.getLocation().toString());
|
||||
|
||||
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
|
||||
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords));
|
||||
done = true;
|
||||
}
|
||||
|
||||
printProgress("loci", locus.getLocation());
|
||||
if (pastFinalLocation(locus.getLocation()))
|
||||
done = true;
|
||||
}
|
||||
|
||||
done = interval != null && locus.getLocation().isPast(interval);
|
||||
//System.out.printf("done is %b, %s vs. %s%n", done, interval, locus.getLocation());
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
protected <M, T> T walkAtLocus( final LocusWalker<M, T> walker,
|
||||
T sum,
|
||||
final LocusContext locus,
|
||||
final ReferenceIterator refSite,
|
||||
final List<ReferenceOrderedDatum> rodData ) {
|
||||
final char refBase = refSite.getBaseAsChar();
|
||||
|
||||
//logger.debug(String.format(" Reference: %s:%d %c", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase));
|
||||
|
||||
//
|
||||
// Execute our contract with the walker. Call filter, map, and reduce
|
||||
//
|
||||
final boolean keepMeP = walker.filter(rodData, refBase, locus);
|
||||
if (keepMeP) {
|
||||
M x = walker.map(rodData, refBase, locus);
|
||||
sum = walker.reduce(x, sum);
|
||||
}
|
||||
|
||||
printProgress("loci", locus.getLocation());
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,108 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByHanger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.ArrayList;
|
||||
import java.io.File;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import edu.mit.broad.picard.filter.FilteringIterator;
|
||||
|
||||
/**
|
||||
* A simple, short-term solution to iterating over all reference positions over a series of
|
||||
* genomic locations. Simply overloads the superclass traverse function to go over the entire
|
||||
* interval's reference positions.
|
||||
*/
|
||||
public class TraverseByLociByReference extends TraverseByLoci {
|
||||
|
||||
public TraverseByLociByReference(File reads, File ref, List<ReferenceOrderedData> rods) {
|
||||
super(reads, ref, rods);
|
||||
}
|
||||
|
||||
public <M,T> T traverse(Walker<M,T> walker, ArrayList<GenomeLoc> locations) {
|
||||
if ( locations.isEmpty() )
|
||||
Utils.scareUser("Requested all locations be processed without providing locations to be processed!");
|
||||
|
||||
return super.traverse(walker, locations);
|
||||
}
|
||||
|
||||
protected <M, T> T carryWalkerOverInterval( LocusWalker<M, T> walker,
|
||||
Iterator<SAMRecord> readIter,
|
||||
T sum,
|
||||
GenomeLoc interval ) {
|
||||
logger.debug(String.format("TraverseByLociByReference.carryWalkerOverInterval Genomic interval is %s", interval));
|
||||
|
||||
boolean done = false;
|
||||
|
||||
List<SAMRecord> NO_READS = new ArrayList<SAMRecord>();
|
||||
List<Integer> NO_OFFSETS = new ArrayList<Integer>();
|
||||
|
||||
FilteringIterator filterIter = new FilteringIterator(readIter, new locusStreamFilterFunc());
|
||||
LocusIterator locusIter = new LocusIteratorByHanger(filterIter); // prepare the iterator by loci from reads
|
||||
ReferenceIterator refSite = refIter.seekForward(interval); // jump to the first reference site
|
||||
LocusContext locusFromReads = advanceReadsToLoc(locusIter, interval); // load up the next locus by reads
|
||||
|
||||
// We keep processing while the next reference location is within the interval
|
||||
while ( interval.containsP(refSite.getLocation()) && ! done ) {
|
||||
logger.debug(String.format(" LocusFromReads is %s", locusFromReads == null ? null : locusFromReads.getLocation()));
|
||||
|
||||
this.nRecords++;
|
||||
GenomeLoc current = refSite.getLocation();
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this locus
|
||||
final List<ReferenceOrderedDatum> rodData = getReferenceOrderedDataAtLocus(rodIters, current);
|
||||
|
||||
LocusContext locus = null;
|
||||
|
||||
if ( locusFromReads != null && locusFromReads.getLocation().compareTo(current) == 0 ) { // we are at the same site
|
||||
locus = locusFromReads;
|
||||
if ( locusIter.hasNext() )
|
||||
locusFromReads = locusIter.next(); // advance the iterator
|
||||
}
|
||||
else
|
||||
locus = new LocusContext(current, NO_READS, NO_OFFSETS); // make the empty locus that has no reads
|
||||
|
||||
locus.setReferenceContig(refSite.getCurrentContig());
|
||||
sum = walkAtLocus( walker, sum, locus, refSite, rodData );
|
||||
|
||||
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
|
||||
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords));
|
||||
done = true;
|
||||
}
|
||||
|
||||
refSite = refIter.next(); // update our location
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
private LocusContext advanceReadsToLoc(LocusIterator locusIter, GenomeLoc target) {
|
||||
if ( ! locusIter.hasNext() )
|
||||
return null;
|
||||
|
||||
LocusContext locus = locusIter.next();
|
||||
|
||||
while ( ! target.containsP(locus.getLocation()) && locusIter.hasNext() ) {
|
||||
logger.debug(String.format(" current locus is %s vs %s => %d", locus.getLocation(), target, locus.getLocation().compareTo(target)));
|
||||
locus = locusIter.next();
|
||||
}
|
||||
|
||||
logger.debug(String.format(" returning %s", locus.getLocation()));
|
||||
return locus;
|
||||
}
|
||||
}
|
||||
|
|
@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.Utils;
|
|||
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
import java.io.File;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
|
@ -28,7 +29,7 @@ public class TraverseByReads extends TraversalEngine {
|
|||
super(reads, ref, rods);
|
||||
}
|
||||
|
||||
public <M,T> T traverse(Walker<M,T> walker, List<GenomeLoc> locations) {
|
||||
public <M,T> T traverse(Walker<M,T> walker, ArrayList<GenomeLoc> locations) {
|
||||
if ( walker instanceof ReadWalker ) {
|
||||
Walker x = walker;
|
||||
ReadWalker<?, ?> readWalker = (ReadWalker<?, ?>)x;
|
||||
|
|
@ -46,10 +47,10 @@ public class TraverseByReads extends TraversalEngine {
|
|||
*
|
||||
* @param walker A read walker object
|
||||
* @param <M> MapType -- the result of calling map() on walker
|
||||
* @param <R> ReduceType -- the result of calling reduce() on the walker
|
||||
* @param <> ReduceType -- the result of calling reduce() on the walker
|
||||
* @return 0 on success
|
||||
*/
|
||||
public <M, T> Object traverseByRead(ReadWalker<M, T> walker, List<GenomeLoc> locations) {
|
||||
public <M, T> Object traverseByRead(ReadWalker<M, T> walker, ArrayList<GenomeLoc> locations) {
|
||||
samReadIter = initializeReads();
|
||||
|
||||
if (refFileName == null && !walker.requiresOrderedReads() && verifyingSamReadIter != null) {
|
||||
|
|
@ -83,7 +84,7 @@ public class TraverseByReads extends TraversalEngine {
|
|||
locus.setReferenceContig(refSite.getCurrentContig());
|
||||
}
|
||||
|
||||
if (inLocations(loc)) {
|
||||
if (GenomeLoc.inLocations(loc, locations)) {
|
||||
|
||||
//
|
||||
// execute the walker contact
|
||||
|
|
@ -101,7 +102,7 @@ public class TraverseByReads extends TraversalEngine {
|
|||
}
|
||||
printProgress("reads", loc);
|
||||
|
||||
if (pastFinalLocation(loc))
|
||||
if (GenomeLoc.pastFinalLocation(loc, locations))
|
||||
done = true;
|
||||
//System.out.printf("Done? %b%n", done);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers;
|
|||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.List;
|
||||
|
|
@ -15,6 +16,8 @@ import java.util.List;
|
|||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PileupWalker extends LocusWalker<Integer, Integer> {
|
||||
public boolean FLAG_UNCOVERED_BASES = true; // todo: how do I make this a command line argument?
|
||||
|
||||
public void initialize() {
|
||||
}
|
||||
|
||||
|
|
@ -37,6 +40,10 @@ public class PileupWalker extends LocusWalker<Integer, Integer> {
|
|||
quals += read.getBaseQualityString().charAt(offset);
|
||||
}
|
||||
|
||||
if ( bases.equals("") && FLAG_UNCOVERED_BASES ) {
|
||||
bases = "*** UNCOVERED SITE ***";
|
||||
}
|
||||
|
||||
String rodString = "";
|
||||
for ( ReferenceOrderedDatum datum : rodData ) {
|
||||
if ( datum != null ) {
|
||||
|
|
|
|||
|
|
@ -47,16 +47,7 @@ public class PrepareROD extends CommandLineProgram {
|
|||
|
||||
// Prepare the sort ordering w.r.t. the sequence dictionary
|
||||
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
|
||||
List<SAMSequenceRecord> refContigs = refFile.getSequenceDictionary().getSequences();
|
||||
HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
|
||||
|
||||
int i = 0;
|
||||
for ( SAMSequenceRecord contig : refContigs ) {
|
||||
System.out.println(contig.getSequenceName());
|
||||
refContigOrdering.put(contig.getSequenceName(), i);
|
||||
i++;
|
||||
}
|
||||
GenomeLoc.setContigOrdering(refContigOrdering);
|
||||
GenomeLoc.setupRefContigOrdering(refFile);
|
||||
|
||||
Class rodClass = Types.get(ROD_TYPE.toLowerCase());
|
||||
|
||||
|
|
|
|||
|
|
@ -259,19 +259,7 @@ public class IndelInspector extends CommandLineProgram {
|
|||
setDefaultContigOrdering();
|
||||
return;
|
||||
}
|
||||
List<SAMSequenceRecord> seqs = h.getSequenceDictionary().getSequences();
|
||||
if ( seqs == null ) {
|
||||
System.out.println("No reference sequence records found in SAM file header, " +
|
||||
"falling back to default contig ordering");
|
||||
setDefaultContigOrdering();
|
||||
return;
|
||||
}
|
||||
int i = 0;
|
||||
Map<String,Integer> rco = new HashMap<String,Integer>();
|
||||
for ( SAMSequenceRecord sr : seqs) {
|
||||
rco.put(sr.getSequenceName(),i++);
|
||||
}
|
||||
GenomeLoc.setContigOrdering(rco);
|
||||
GenomeLoc.setupRefContigOrdering(h.getSequenceDictionary());
|
||||
}
|
||||
|
||||
private void setDefaultContigOrdering() {
|
||||
|
|
|
|||
|
|
@ -1,9 +1,21 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import net.sf.functionalj.reflect.StdReflect;
|
||||
import net.sf.functionalj.reflect.JdkStdReflect;
|
||||
import net.sf.functionalj.FunctionN;
|
||||
import net.sf.functionalj.Function1;
|
||||
import net.sf.functionalj.Functions;
|
||||
import net.sf.functionalj.util.Operators;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
|
|
@ -15,23 +27,90 @@ import java.util.regex.Matcher;
|
|||
*
|
||||
*/
|
||||
public class GenomeLoc implements Comparable<GenomeLoc> {
|
||||
private static Logger logger = Logger.getLogger(GenomeLoc.class);
|
||||
|
||||
private String contig;
|
||||
private long start;
|
||||
private long stop;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Ugly global variable defining the optional ordering of contig elements
|
||||
//
|
||||
public static Map<String, Integer> refContigOrdering = null;
|
||||
public static HashMap<String, String> interns = null;
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//public static Map<String, Integer> refContigOrdering = null;
|
||||
private static SAMSequenceDictionary contigInfo = null;
|
||||
private static HashMap<String, String> interns = null;
|
||||
|
||||
public static boolean hasKnownContigOrdering() {
|
||||
return contigInfo != null;
|
||||
}
|
||||
|
||||
public static SAMSequenceRecord getContigInfo( final String contig ) {
|
||||
return contigInfo.getSequence(contig);
|
||||
}
|
||||
|
||||
public static int getContigIndex( final String contig ) {
|
||||
return contigInfo.getSequenceIndex(contig);
|
||||
}
|
||||
|
||||
/*
|
||||
public static void setContigOrdering(Map<String, Integer> rco) {
|
||||
refContigOrdering = rco;
|
||||
interns = new HashMap<String, String>();
|
||||
for ( String contig : rco.keySet() )
|
||||
interns.put( contig, contig );
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
public static boolean setupRefContigOrdering(final ReferenceSequenceFile refFile) {
|
||||
final SAMSequenceDictionary seqDict = refFile.getSequenceDictionary();
|
||||
|
||||
if (seqDict == null) // we couldn't load the reference dictionary
|
||||
return false;
|
||||
|
||||
List<SAMSequenceRecord> refContigs = seqDict.getSequences();
|
||||
HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
|
||||
|
||||
if (refContigs != null) {
|
||||
int i = 0;
|
||||
logger.info(String.format("Prepared reference sequence contig dictionary%n order ->"));
|
||||
for (SAMSequenceRecord contig : refContigs) {
|
||||
logger.info(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
|
||||
refContigOrdering.put(contig.getSequenceName(), i);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
setContigOrdering(refContigOrdering);
|
||||
return refContigs != null;
|
||||
}
|
||||
*/
|
||||
|
||||
public static boolean setupRefContigOrdering(final ReferenceSequenceFile refFile) {
|
||||
return setupRefContigOrdering(refFile.getSequenceDictionary());
|
||||
}
|
||||
|
||||
public static boolean setupRefContigOrdering(final SAMSequenceDictionary seqDict) {
|
||||
if (seqDict == null) // we couldn't load the reference dictionary
|
||||
return false;
|
||||
else {
|
||||
contigInfo = seqDict;
|
||||
logger.info(String.format("Prepared reference sequence contig dictionary%n order ->"));
|
||||
for (SAMSequenceRecord contig : seqDict.getSequences() ) {
|
||||
logger.info(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// constructors
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
public GenomeLoc( String contig, final long start, final long stop ) {
|
||||
if ( interns != null )
|
||||
contig = interns.get(contig);
|
||||
|
|
@ -49,9 +128,11 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
|||
this( new String(toCopy.getContig()), toCopy.getStart(), toCopy.getStop() );
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Parsing string representations
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
private static long parsePosition( final String pos ) {
|
||||
String x = pos.replaceAll(",", "");
|
||||
return Long.parseLong(x);
|
||||
|
|
@ -101,12 +182,104 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
|||
throw new RuntimeException("Invalid Genome Location string: " + str);
|
||||
}
|
||||
|
||||
if ( stop == Integer.MAX_VALUE && hasKnownContigOrdering() ) {
|
||||
// lookup the actually stop position!
|
||||
stop = getContigInfo(contig).getSequenceLength();
|
||||
}
|
||||
|
||||
GenomeLoc loc = new GenomeLoc(contig, start, stop);
|
||||
//System.out.printf(" => Parsed location '%s' into %s%n", str, loc);
|
||||
|
||||
return loc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Useful utility function that parses a location string into a coordinate-order sorted
|
||||
* array of GenomeLoc objects
|
||||
*
|
||||
* @param str
|
||||
* @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order
|
||||
*/
|
||||
public static ArrayList<GenomeLoc> parseGenomeLocs(final String str) {
|
||||
// Of the form: loc1;loc2;...
|
||||
// Where each locN can be:
|
||||
// Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
|
||||
StdReflect reflect = new JdkStdReflect();
|
||||
FunctionN<GenomeLoc> parseOne = reflect.staticFunction(GenomeLoc.class, "parseGenomeLoc", String.class);
|
||||
Function1<GenomeLoc, String> f1 = parseOne.f1();
|
||||
try {
|
||||
Collection<GenomeLoc> result = Functions.map(f1, Arrays.asList(str.split(";")));
|
||||
ArrayList<GenomeLoc> locs = new ArrayList(result);
|
||||
Collections.sort(locs);
|
||||
//logger.info(String.format("Going to process %d locations", locs.length));
|
||||
locs = mergeOverlappingLocations(locs);
|
||||
logger.info(" Locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, locs)));
|
||||
return locs;
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
Utils.scareUser(String.format("Invalid locations string: %s, format is loc1;loc2; where each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'", str));
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public static ArrayList<GenomeLoc> mergeOverlappingLocations(final ArrayList<GenomeLoc> raw) {
|
||||
logger.info(" Raw locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, raw)));
|
||||
if ( raw.size() <= 1 )
|
||||
return raw;
|
||||
else {
|
||||
ArrayList<GenomeLoc> merged = new ArrayList<GenomeLoc>();
|
||||
Iterator<GenomeLoc> it = raw.iterator();
|
||||
GenomeLoc prev = it.next();
|
||||
while ( it.hasNext() ) {
|
||||
GenomeLoc curr = it.next();
|
||||
if ( prev.contiguousP(curr) ) {
|
||||
prev = prev.merge(curr);
|
||||
} else {
|
||||
merged.add(prev);
|
||||
prev = curr;
|
||||
}
|
||||
}
|
||||
merged.add(prev);
|
||||
return merged;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true iff we have a specified series of locations to process AND we are past the last
|
||||
* location in the list. It means that, in a serial processing of the genome, that we are done.
|
||||
*
|
||||
* @param curr Current genome Location
|
||||
* @return true if we are past the last location to process
|
||||
*/
|
||||
public static boolean pastFinalLocation(GenomeLoc curr, ArrayList<GenomeLoc> locs) {
|
||||
if ( locs == null )
|
||||
return false;
|
||||
else {
|
||||
GenomeLoc last = locs.get(locs.size() - 1);
|
||||
return last.compareTo(curr) == -1 && ! last.overlapsP(curr);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A key function that returns true if the proposed GenomeLoc curr is within the list of
|
||||
* locations we are processing in this TraversalEngine
|
||||
*
|
||||
* @param curr
|
||||
* @return true if we should process GenomeLoc curr, otherwise false
|
||||
*/
|
||||
public static boolean inLocations(GenomeLoc curr, ArrayList<GenomeLoc> locs) {
|
||||
if (locs == null) {
|
||||
return true;
|
||||
} else {
|
||||
for ( GenomeLoc loc : locs ) {
|
||||
//System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr));
|
||||
if (loc.overlapsP(curr))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Accessors and setters
|
||||
//
|
||||
|
|
@ -147,10 +320,34 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
|||
return false;
|
||||
}
|
||||
|
||||
public final boolean discontinuousP(GenomeLoc that) {
|
||||
if ( compareContigs(this.contig, that.contig) != 0 ) return true; // different chromosomes
|
||||
if ( (this.start - 1) > that.stop ) return true; // this guy is past that
|
||||
if ( (that.start - 1) > this.stop ) return true; // that guy is past our start
|
||||
return false;
|
||||
}
|
||||
|
||||
public final boolean overlapsP(GenomeLoc that) {
|
||||
return ! disjointP( that );
|
||||
}
|
||||
|
||||
public final boolean contiguousP(GenomeLoc that) {
|
||||
return ! discontinuousP( that );
|
||||
}
|
||||
|
||||
public GenomeLoc merge( GenomeLoc that ) {
|
||||
assert this.contiguousP(that);
|
||||
|
||||
return new GenomeLoc(getContig(),
|
||||
Math.min(getStart(), that.getStart()),
|
||||
Math.max( getStop(), that.getStop()) );
|
||||
}
|
||||
|
||||
public final boolean containsP(GenomeLoc that) {
|
||||
if ( ! onSameContig(that) ) return false;
|
||||
return getStart() <= that.getStart() && getStop() >= that.getStop();
|
||||
}
|
||||
|
||||
public final boolean onSameContig(GenomeLoc that) {
|
||||
return this.contig.equals(that.contig);
|
||||
}
|
||||
|
|
@ -170,6 +367,10 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
|||
return this.compareTo(left) > -1 && this.compareTo(right) < 1;
|
||||
}
|
||||
|
||||
public final boolean isPast( GenomeLoc that ) {
|
||||
return this.compareContigs(that) == 1 || this.getStart() > that.getStop();
|
||||
}
|
||||
|
||||
public final void incPos() {
|
||||
incPos(1);
|
||||
}
|
||||
|
|
@ -194,14 +395,17 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
|||
return 0;
|
||||
}
|
||||
|
||||
assert refContigOrdering.containsKey(thisContig);// : this;
|
||||
assert refContigOrdering.containsKey(thatContig);// : that;
|
||||
//assert getContigIndex(thisContig) != -1;// : this;
|
||||
//assert getContigIndex(thatContig) != -1;// : that;
|
||||
|
||||
if ( refContigOrdering != null )
|
||||
if ( hasKnownContigOrdering() )
|
||||
{
|
||||
if ( ! refContigOrdering.containsKey(thisContig) )
|
||||
int thisIndex = getContigIndex(thisContig);
|
||||
int thatIndex = getContigIndex(thatContig);
|
||||
|
||||
if ( thisIndex == -1 )
|
||||
{
|
||||
if ( ! refContigOrdering.containsKey(thatContig) )
|
||||
if ( thatIndex == -1 )
|
||||
{
|
||||
// Use regular sorted order
|
||||
return thisContig.compareTo(thatContig);
|
||||
|
|
@ -212,20 +416,14 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
|||
return 1;
|
||||
}
|
||||
}
|
||||
else if ( ! refContigOrdering.containsKey(thatContig) )
|
||||
else if ( thatIndex == -1 )
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
assert refContigOrdering.containsKey(thisContig);// : this;
|
||||
assert refContigOrdering.containsKey(thatContig);// : that;
|
||||
|
||||
final int thisO = refContigOrdering.get(thisContig);
|
||||
final int thatO = refContigOrdering.get(thatContig);
|
||||
|
||||
if ( thisO < thatO ) return -1;
|
||||
if ( thisO > thatO ) return 1;
|
||||
if ( thisIndex < thatIndex ) return -1;
|
||||
if ( thisIndex > thatIndex ) return 1;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,19 +27,19 @@ public class Utils {
|
|||
private static Logger logger = Logger.getLogger(FileProgressTracker.class);
|
||||
|
||||
public static void warnUser(final String msg) {
|
||||
logger.warn(String.format("********************************************************************************%n"));
|
||||
logger.warn(String.format("* WARNING:%n"));
|
||||
logger.warn(String.format("*%n"));
|
||||
logger.warn(String.format("* %s%n", msg));
|
||||
logger.warn(String.format("********************************************************************************%n"));
|
||||
logger.warn(String.format("********************************************************************************"));
|
||||
logger.warn(String.format("* WARNING:"));
|
||||
logger.warn(String.format("*"));
|
||||
logger.warn(String.format("* %s", msg));
|
||||
logger.warn(String.format("********************************************************************************"));
|
||||
}
|
||||
|
||||
public static void scareUser(final String msg) {
|
||||
logger.fatal(String.format("********************************************************************************%n"));
|
||||
logger.fatal(String.format("* ERROR:%n"));
|
||||
logger.fatal(String.format("*%n"));
|
||||
logger.fatal(String.format("* %s%n", msg));
|
||||
logger.fatal(String.format("********************************************************************************%n"));
|
||||
logger.fatal(String.format("********************************************************************************"));
|
||||
logger.fatal(String.format("* ERROR:"));
|
||||
logger.fatal(String.format("*"));
|
||||
logger.fatal(String.format("* %s", msg));
|
||||
logger.fatal(String.format("********************************************************************************"));
|
||||
throw new RuntimeException(msg);
|
||||
}
|
||||
|
||||
|
|
@ -232,30 +232,6 @@ public class Utils {
|
|||
return averageDouble(vals, vals.size());
|
||||
}
|
||||
|
||||
public static boolean setupRefContigOrdering(final ReferenceSequenceFile refFile) {
|
||||
final SAMSequenceDictionary seqDict = refFile.getSequenceDictionary();
|
||||
|
||||
if (seqDict == null) // we couldn't load the reference dictionary
|
||||
return false;
|
||||
|
||||
List<SAMSequenceRecord> refContigs = seqDict.getSequences();
|
||||
HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
|
||||
|
||||
if (refContigs != null) {
|
||||
int i = 0;
|
||||
logger.info(String.format("Prepared reference sequence contig dictionary%n order ->"));
|
||||
for (SAMSequenceRecord contig : refContigs) {
|
||||
logger.info(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
|
||||
refContigOrdering.put(contig.getSequenceName(), i);
|
||||
i++;
|
||||
}
|
||||
logger.info(String.format("%n Total elements -> %d%n", refContigOrdering.size()));
|
||||
}
|
||||
|
||||
GenomeLoc.setContigOrdering(refContigOrdering);
|
||||
return refContigs != null;
|
||||
}
|
||||
|
||||
// Java Generics can't do primitive types, so I had to do this the simplistic way
|
||||
|
||||
public static Integer[] SortPermutation(final int[] A) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue