Reorganized GenomeLoc code to more clearly and better use the picard SequenceDictionary information.

All GenomeLoc[] are not ArrayList<GenomeLoc> for clarity and consistency
Parsing now recursively merges contiguous elements chr1:1-10;chr1:11-20 => chr1:1-20
Added support for TraversingByLoci over all reference positions specified by the provided location array.  System dynamically determines which traversal system to use.
Pileup now marks, very clearly, reference positions without covered reads.
Made changes around the codebase to deal with new GenomeLoc structure.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@218 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-03-28 20:37:27 +00:00
parent c2ae6765a3
commit d7c0bcc223
11 changed files with 411 additions and 182 deletions

View File

@ -16,6 +16,7 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.gatk.traversals.TraverseByReads;
import org.broadinstitute.sting.gatk.traversals.TraverseByLoci;
import org.broadinstitute.sting.gatk.traversals.TraverseByLociByReference;
import org.broadinstitute.sting.utils.FastaSequenceFile2;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
@ -187,7 +188,10 @@ public class GenomeAnalysisTK extends CommandLineProgram {
// Try to get the walker specified
try {
LocusWalker<?, ?> walker = (LocusWalker<?, ?>) my_walker;
this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods);
if ( WALK_ALL_LOCI )
this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods);
else
this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods);
}
catch (java.lang.ClassCastException e) {
// I guess we're a read walker LOL
@ -198,16 +202,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
// Prepare the sort ordering w.r.t. the sequence dictionary
if (REF_FILE_ARG != null) {
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
List<SAMSequenceRecord> refContigs = refFile.getSequenceDictionary().getSequences();
HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
int i = 0;
for (SAMSequenceRecord contig : refContigs) {
refContigOrdering.put(contig.getSequenceName(), i);
i++;
}
GenomeLoc.setContigOrdering(refContigOrdering);
GenomeLoc.setupRefContigOrdering(refFile);
}
ValidationStringency strictness;
if (STRICTNESS_ARG == null) {
@ -232,6 +227,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
if (INTERVALS_FILE != null) {
engine.setLocationFromFile(INTERVALS_FILE);
}
engine.setSafetyChecking(!UNSAFE);
engine.setSortOnFly(ENABLED_SORT_ON_FLY);
engine.setThreadedIO(ENABLED_THREADED_IO);
@ -277,7 +273,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
*/
private static void testNewReferenceFeatures(final File refFileName) {
final FastaSequenceFile2 refFile = new FastaSequenceFile2(refFileName);
Utils.setupRefContigOrdering(refFile);
GenomeLoc.setupRefContigOrdering(refFile);
List<SAMSequenceRecord> refContigs = refFile.getSequenceDictionary().getSequences();

View File

@ -24,14 +24,14 @@ public class SamQueryIterator implements Iterator<SAMRecord> {
Iterator<GenomeLoc> locIter = null;
CloseableIterator<SAMRecord> recordIter = null;
public SamQueryIterator( SAMFileReader reader, GenomeLoc[] locs ) {
public SamQueryIterator( SAMFileReader reader, ArrayList<GenomeLoc> locs ) {
this.reader = reader;
// Our internal contract for the class guarantees that locIter and recordIter are never null.
// Initialize them and seed them with empty data as necessary.
if(locs != null) {
// The user requested a specific set of locations, set up the iterators accordly.
locIter = Arrays.asList(locs).iterator();
locIter = locs.iterator();
recordIter = new NullCloseableIterator<SAMRecord>();
}
else {

View File

@ -93,7 +93,7 @@ public abstract class TraversalEngine {
// Locations we are going to process during the traversal
protected GenomeLoc[] locs = null;
private ArrayList<GenomeLoc> locs = null;
// --------------------------------------------------------------------------------------------------------------
//
@ -175,7 +175,7 @@ public abstract class TraversalEngine {
* @param locStr
*/
public void setLocation(final String locStr) {
this.locs = parseGenomeLocs(locStr);
this.locs = GenomeLoc.parseGenomeLocs(locStr);
}
/**
@ -194,77 +194,19 @@ public abstract class TraversalEngine {
reader.close();
String locStr = Utils.join(";", lines);
logger.debug("locStr: " + locStr);
this.locs = parseGenomeLocs(locStr);
setLocation(locStr);
} catch (Exception e) {
e.printStackTrace();
System.exit(-1);
}
}
/**
* Useful utility function that parses a location string into a coordinate-order sorted
* array of GenomeLoc objects
*
* @param str
* @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order
*/
public static GenomeLoc[] parseGenomeLocs(final String str) {
// Of the form: loc1;loc2;...
// Where each locN can be:
// Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
StdReflect reflect = new JdkStdReflect();
FunctionN<GenomeLoc> parseOne = reflect.staticFunction(GenomeLoc.class, "parseGenomeLoc", String.class);
Function1<GenomeLoc, String> f1 = parseOne.f1();
try {
Collection<GenomeLoc> result = Functions.map(f1, Arrays.asList(str.split(";")));
GenomeLoc[] locs = (GenomeLoc[]) result.toArray(new GenomeLoc[0]);
Arrays.sort(locs);
logger.info(String.format("Going to process %d locations", locs.length));
//System.out.println(" Locations are: " + Utils.join("\n", Functions.map(Operators.toString, Arrays.asList(locs))));
return locs;
} catch (Exception e) {
logger.fatal(String.format("Invalid locations string: %s, format is loc1;loc2; where each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'", str));
throw new IllegalArgumentException("Invalid locations string: " + str + ", format is loc1;loc2; where each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'");
}
}
/**
* A key function that returns true if the proposed GenomeLoc curr is within the list of
* locations we are processing in this TraversalEngine
*
* @param curr
* @return true if we should process GenomeLoc curr, otherwise false
*/
public boolean inLocations(GenomeLoc curr) {
if (this.locs == null) {
return true;
} else {
for ( GenomeLoc loc : this.locs ) {
//System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr));
if (loc.overlapsP(curr))
return true;
}
return false;
}
}
public boolean hasLocations() {
return this.locs != null;
}
/**
* Returns true iff we have a specified series of locations to process AND we are past the last
* location in the list. It means that, in a serial processing of the genome, that we are done.
*
* @param curr Current genome Location
* @return true if we are past the last location to process
*/
protected boolean pastFinalLocation(GenomeLoc curr) {
boolean r = locs != null && locs[locs.length - 1].compareTo(curr) == -1 && !locs[locs.length - 1].overlapsP(curr);
//System.out.printf(" pastFinalLocation %s vs. %s => %d => %b%n", locs[locs.length-1], curr, locs[locs.length-1].compareTo( curr ), r);
return r;
}
// --------------------------------------------------------------------------------------------------------------
//
// printing
@ -433,11 +375,11 @@ public abstract class TraversalEngine {
//this.refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFileName);
this.refFile = new FastaSequenceFile2(refFileName); // todo: replace when FastaSequenceFile2 is in picard
this.refIter = new ReferenceIterator(this.refFile);
if (!Utils.setupRefContigOrdering(this.refFile)) {
// We couldn't process the reference contig ordering, fail since we need it
logger.fatal(String.format("We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.", refFileName));
throw new RuntimeException("We couldn't load the contig dictionary associated with " + refFileName + ". At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.");
}
// if (!GenomeLoc.setupRefContigOrdering(this.refFile)) {
// // We couldn't process the reference contig ordering, fail since we need it
// logger.fatal(String.format("We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.", refFileName));
// throw new RuntimeException("We couldn't load the contig dictionary associated with " + refFileName + ". At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.");
// }
}
}
@ -510,14 +452,14 @@ public abstract class TraversalEngine {
// --------------------------------------------------------------------------------------------------------------
public <M, T> T traverse(Walker<M, T> walker) {
List<GenomeLoc> l = new ArrayList<GenomeLoc>();
ArrayList<GenomeLoc> l = new ArrayList<GenomeLoc>();
if ( hasLocations() )
l = Arrays.asList(locs);
l = locs;
return traverse(walker, l);
}
public <M, T> T traverse(Walker<M, T> walker, List<GenomeLoc> locations) {
public <M, T> T traverse(Walker<M, T> walker, ArrayList<GenomeLoc> locations) {
return null;
}

View File

@ -15,6 +15,7 @@ import org.broadinstitute.sting.utils.Utils;
import java.util.List;
import java.util.Arrays;
import java.util.Iterator;
import java.util.ArrayList;
import java.io.File;
import net.sf.samtools.SAMRecord;
@ -34,7 +35,7 @@ public class TraverseByLoci extends TraversalEngine {
super(reads, ref, rods);
}
public <M,T> T traverse(Walker<M,T> walker, List<GenomeLoc> locations) {
public <M,T> T traverse(Walker<M,T> walker, ArrayList<GenomeLoc> locations) {
if ( walker instanceof LocusWalker ) {
Walker x = walker;
LocusWalker<?, ?> locusWalker = (LocusWalker<?, ?>)x;
@ -54,7 +55,9 @@ public class TraverseByLoci extends TraversalEngine {
* @param <T> ReduceType -- the result of calling reduce() on the walker
* @return 0 on success
*/
protected <M, T> T traverseByLoci(LocusWalker<M, T> walker, List<GenomeLoc> locations) {
protected <M, T> T traverseByLoci(LocusWalker<M, T> walker, ArrayList<GenomeLoc> locations) {
logger.debug("Entering traverseByLoci");
samReader = initializeSAMFile(readsFile);
verifySortOrder(true);
@ -63,9 +66,14 @@ public class TraverseByLoci extends TraversalEngine {
walker.initialize();
T sum = walker.reduceInit();
if ( samReader.hasIndex() && hasLocations() ) {
if ( ! locations.isEmpty() ) {
logger.debug("Doing interval-based traversal");
if ( ! samReader.hasIndex() )
Utils.scareUser("Processing locations were requested, but no index was found for the input SAM/BAM file. This operation is potentially dangerously slow, aborting.");
// we are doing interval-based traversals
for ( GenomeLoc interval : locs ) {
for ( GenomeLoc interval : locations ) {
logger.debug(String.format("Processing locus %s", interval.toString()));
CloseableIterator<SAMRecord> readIter = samReader.queryOverlapping( interval.getContig(),
@ -79,6 +87,7 @@ public class TraverseByLoci extends TraversalEngine {
}
else {
// We aren't locus oriented
logger.debug("Doing non-interval-based traversal");
samReadIter = WrapReadsIterator(getReadsIterator(samReader), true);
sum = carryWalkerOverInterval(walker, samReadIter, sum, null);
}
@ -89,6 +98,8 @@ public class TraverseByLoci extends TraversalEngine {
}
protected <M, T> T carryWalkerOverInterval( LocusWalker<M, T> walker, Iterator<SAMRecord> readIter, T sum, GenomeLoc interval ) {
logger.debug(String.format("TraverseByLoci.carryWalkerOverInterval Genomic interval is %s", interval));
// prepare the read filtering read iterator and provide it to a new locus iterator
FilteringIterator filterIter = new FilteringIterator(readIter, new locusStreamFilterFunc());
@ -102,38 +113,49 @@ public class TraverseByLoci extends TraversalEngine {
// if we don't have a particular interval we're processing, check them all, otherwise only operate at this
// location
if ( ( interval == null && inLocations(locus.getLocation()) ) || (interval != null && interval.overlapsP(locus.getLocation())) ) {
//System.out.format("Working at %s\n", locus.getLocation().toString());
if ( interval == null || interval.overlapsP(locus.getLocation()) ) {
ReferenceIterator refSite = refIter.seekForward(locus.getLocation());
final char refBase = refSite.getBaseAsChar();
locus.setReferenceContig(refSite.getCurrentContig());
// Iterate forward to get all reference ordered data covering this locus
final List<ReferenceOrderedDatum> rodData = getReferenceOrderedDataAtLocus(rodIters, locus.getLocation());
logger.debug(String.format(" Reference: %s:%d %c", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase));
walkAtLocus( walker, sum, locus, refSite, rodData );
//
// Execute our contract with the walker. Call filter, map, and reduce
//
final boolean keepMeP = walker.filter(rodData, refBase, locus);
if (keepMeP) {
M x = walker.map(rodData, refBase, locus);
sum = walker.reduce(x, sum);
}
//System.out.format("Working at %s\n", locus.getLocation().toString());
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords));
done = true;
}
printProgress("loci", locus.getLocation());
if (pastFinalLocation(locus.getLocation()))
done = true;
}
done = interval != null && locus.getLocation().isPast(interval);
//System.out.printf("done is %b, %s vs. %s%n", done, interval, locus.getLocation());
}
return sum;
}
protected <M, T> T walkAtLocus( final LocusWalker<M, T> walker,
T sum,
final LocusContext locus,
final ReferenceIterator refSite,
final List<ReferenceOrderedDatum> rodData ) {
final char refBase = refSite.getBaseAsChar();
//logger.debug(String.format(" Reference: %s:%d %c", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase));
//
// Execute our contract with the walker. Call filter, map, and reduce
//
final boolean keepMeP = walker.filter(rodData, refBase, locus);
if (keepMeP) {
M x = walker.map(rodData, refBase, locus);
sum = walker.reduce(x, sum);
}
printProgress("loci", locus.getLocation());
return sum;
}
}

View File

@ -0,0 +1,108 @@
package org.broadinstitute.sting.gatk.traversals;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByHanger;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import java.util.List;
import java.util.Arrays;
import java.util.Iterator;
import java.util.ArrayList;
import java.io.File;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import edu.mit.broad.picard.filter.FilteringIterator;
/**
* A simple, short-term solution to iterating over all reference positions over a series of
* genomic locations. Simply overloads the superclass traverse function to go over the entire
* interval's reference positions.
*/
public class TraverseByLociByReference extends TraverseByLoci {
public TraverseByLociByReference(File reads, File ref, List<ReferenceOrderedData> rods) {
super(reads, ref, rods);
}
public <M,T> T traverse(Walker<M,T> walker, ArrayList<GenomeLoc> locations) {
if ( locations.isEmpty() )
Utils.scareUser("Requested all locations be processed without providing locations to be processed!");
return super.traverse(walker, locations);
}
protected <M, T> T carryWalkerOverInterval( LocusWalker<M, T> walker,
Iterator<SAMRecord> readIter,
T sum,
GenomeLoc interval ) {
logger.debug(String.format("TraverseByLociByReference.carryWalkerOverInterval Genomic interval is %s", interval));
boolean done = false;
List<SAMRecord> NO_READS = new ArrayList<SAMRecord>();
List<Integer> NO_OFFSETS = new ArrayList<Integer>();
FilteringIterator filterIter = new FilteringIterator(readIter, new locusStreamFilterFunc());
LocusIterator locusIter = new LocusIteratorByHanger(filterIter); // prepare the iterator by loci from reads
ReferenceIterator refSite = refIter.seekForward(interval); // jump to the first reference site
LocusContext locusFromReads = advanceReadsToLoc(locusIter, interval); // load up the next locus by reads
// We keep processing while the next reference location is within the interval
while ( interval.containsP(refSite.getLocation()) && ! done ) {
logger.debug(String.format(" LocusFromReads is %s", locusFromReads == null ? null : locusFromReads.getLocation()));
this.nRecords++;
GenomeLoc current = refSite.getLocation();
// Iterate forward to get all reference ordered data covering this locus
final List<ReferenceOrderedDatum> rodData = getReferenceOrderedDataAtLocus(rodIters, current);
LocusContext locus = null;
if ( locusFromReads != null && locusFromReads.getLocation().compareTo(current) == 0 ) { // we are at the same site
locus = locusFromReads;
if ( locusIter.hasNext() )
locusFromReads = locusIter.next(); // advance the iterator
}
else
locus = new LocusContext(current, NO_READS, NO_OFFSETS); // make the empty locus that has no reads
locus.setReferenceContig(refSite.getCurrentContig());
sum = walkAtLocus( walker, sum, locus, refSite, rodData );
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords));
done = true;
}
refSite = refIter.next(); // update our location
}
return sum;
}
private LocusContext advanceReadsToLoc(LocusIterator locusIter, GenomeLoc target) {
if ( ! locusIter.hasNext() )
return null;
LocusContext locus = locusIter.next();
while ( ! target.containsP(locus.getLocation()) && locusIter.hasNext() ) {
logger.debug(String.format(" current locus is %s vs %s => %d", locus.getLocation(), target, locus.getLocation().compareTo(target)));
locus = locusIter.next();
}
logger.debug(String.format(" returning %s", locus.getLocation()));
return locus;
}
}

View File

@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.Utils;
import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
import java.io.File;
import net.sf.samtools.SAMRecord;
@ -28,7 +29,7 @@ public class TraverseByReads extends TraversalEngine {
super(reads, ref, rods);
}
public <M,T> T traverse(Walker<M,T> walker, List<GenomeLoc> locations) {
public <M,T> T traverse(Walker<M,T> walker, ArrayList<GenomeLoc> locations) {
if ( walker instanceof ReadWalker ) {
Walker x = walker;
ReadWalker<?, ?> readWalker = (ReadWalker<?, ?>)x;
@ -46,10 +47,10 @@ public class TraverseByReads extends TraversalEngine {
*
* @param walker A read walker object
* @param <M> MapType -- the result of calling map() on walker
* @param <R> ReduceType -- the result of calling reduce() on the walker
* @param <> ReduceType -- the result of calling reduce() on the walker
* @return 0 on success
*/
public <M, T> Object traverseByRead(ReadWalker<M, T> walker, List<GenomeLoc> locations) {
public <M, T> Object traverseByRead(ReadWalker<M, T> walker, ArrayList<GenomeLoc> locations) {
samReadIter = initializeReads();
if (refFileName == null && !walker.requiresOrderedReads() && verifyingSamReadIter != null) {
@ -83,7 +84,7 @@ public class TraverseByReads extends TraversalEngine {
locus.setReferenceContig(refSite.getCurrentContig());
}
if (inLocations(loc)) {
if (GenomeLoc.inLocations(loc, locations)) {
//
// execute the walker contact
@ -101,7 +102,7 @@ public class TraverseByReads extends TraversalEngine {
}
printProgress("reads", loc);
if (pastFinalLocation(loc))
if (GenomeLoc.pastFinalLocation(loc, locations))
done = true;
//System.out.printf("Done? %b%n", done);
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import net.sf.samtools.SAMRecord;
import java.util.List;
@ -15,6 +16,8 @@ import java.util.List;
* To change this template use File | Settings | File Templates.
*/
public class PileupWalker extends LocusWalker<Integer, Integer> {
public boolean FLAG_UNCOVERED_BASES = true; // todo: how do I make this a command line argument?
public void initialize() {
}
@ -37,6 +40,10 @@ public class PileupWalker extends LocusWalker<Integer, Integer> {
quals += read.getBaseQualityString().charAt(offset);
}
if ( bases.equals("") && FLAG_UNCOVERED_BASES ) {
bases = "*** UNCOVERED SITE ***";
}
String rodString = "";
for ( ReferenceOrderedDatum datum : rodData ) {
if ( datum != null ) {

View File

@ -47,16 +47,7 @@ public class PrepareROD extends CommandLineProgram {
// Prepare the sort ordering w.r.t. the sequence dictionary
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
List<SAMSequenceRecord> refContigs = refFile.getSequenceDictionary().getSequences();
HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
int i = 0;
for ( SAMSequenceRecord contig : refContigs ) {
System.out.println(contig.getSequenceName());
refContigOrdering.put(contig.getSequenceName(), i);
i++;
}
GenomeLoc.setContigOrdering(refContigOrdering);
GenomeLoc.setupRefContigOrdering(refFile);
Class rodClass = Types.get(ROD_TYPE.toLowerCase());

View File

@ -259,19 +259,7 @@ public class IndelInspector extends CommandLineProgram {
setDefaultContigOrdering();
return;
}
List<SAMSequenceRecord> seqs = h.getSequenceDictionary().getSequences();
if ( seqs == null ) {
System.out.println("No reference sequence records found in SAM file header, " +
"falling back to default contig ordering");
setDefaultContigOrdering();
return;
}
int i = 0;
Map<String,Integer> rco = new HashMap<String,Integer>();
for ( SAMSequenceRecord sr : seqs) {
rco.put(sr.getSequenceName(),i++);
}
GenomeLoc.setContigOrdering(rco);
GenomeLoc.setupRefContigOrdering(h.getSequenceDictionary());
}
private void setDefaultContigOrdering() {

View File

@ -1,9 +1,21 @@
package org.broadinstitute.sting.utils;
import net.sf.functionalj.reflect.StdReflect;
import net.sf.functionalj.reflect.JdkStdReflect;
import net.sf.functionalj.FunctionN;
import net.sf.functionalj.Function1;
import net.sf.functionalj.Functions;
import net.sf.functionalj.util.Operators;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import org.apache.log4j.Logger;
/**
* Created by IntelliJ IDEA.
* User: mdepristo
@ -15,23 +27,90 @@ import java.util.regex.Matcher;
*
*/
public class GenomeLoc implements Comparable<GenomeLoc> {
private static Logger logger = Logger.getLogger(GenomeLoc.class);
private String contig;
private long start;
private long stop;
// --------------------------------------------------------------------------------------------------------------
//
// Ugly global variable defining the optional ordering of contig elements
//
public static Map<String, Integer> refContigOrdering = null;
public static HashMap<String, String> interns = null;
// --------------------------------------------------------------------------------------------------------------
//public static Map<String, Integer> refContigOrdering = null;
private static SAMSequenceDictionary contigInfo = null;
private static HashMap<String, String> interns = null;
public static boolean hasKnownContigOrdering() {
return contigInfo != null;
}
public static SAMSequenceRecord getContigInfo( final String contig ) {
return contigInfo.getSequence(contig);
}
public static int getContigIndex( final String contig ) {
return contigInfo.getSequenceIndex(contig);
}
/*
public static void setContigOrdering(Map<String, Integer> rco) {
refContigOrdering = rco;
interns = new HashMap<String, String>();
for ( String contig : rco.keySet() )
interns.put( contig, contig );
}
*/
/*
public static boolean setupRefContigOrdering(final ReferenceSequenceFile refFile) {
final SAMSequenceDictionary seqDict = refFile.getSequenceDictionary();
if (seqDict == null) // we couldn't load the reference dictionary
return false;
List<SAMSequenceRecord> refContigs = seqDict.getSequences();
HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
if (refContigs != null) {
int i = 0;
logger.info(String.format("Prepared reference sequence contig dictionary%n order ->"));
for (SAMSequenceRecord contig : refContigs) {
logger.info(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
refContigOrdering.put(contig.getSequenceName(), i);
i++;
}
}
setContigOrdering(refContigOrdering);
return refContigs != null;
}
*/
public static boolean setupRefContigOrdering(final ReferenceSequenceFile refFile) {
return setupRefContigOrdering(refFile.getSequenceDictionary());
}
public static boolean setupRefContigOrdering(final SAMSequenceDictionary seqDict) {
if (seqDict == null) // we couldn't load the reference dictionary
return false;
else {
contigInfo = seqDict;
logger.info(String.format("Prepared reference sequence contig dictionary%n order ->"));
for (SAMSequenceRecord contig : seqDict.getSequences() ) {
logger.info(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
}
}
return true;
}
// --------------------------------------------------------------------------------------------------------------
//
// constructors
//
// --------------------------------------------------------------------------------------------------------------
public GenomeLoc( String contig, final long start, final long stop ) {
if ( interns != null )
contig = interns.get(contig);
@ -49,9 +128,11 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
this( new String(toCopy.getContig()), toCopy.getStart(), toCopy.getStop() );
}
// --------------------------------------------------------------------------------------------------------------
//
// Parsing string representations
//
// --------------------------------------------------------------------------------------------------------------
private static long parsePosition( final String pos ) {
String x = pos.replaceAll(",", "");
return Long.parseLong(x);
@ -101,12 +182,104 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
throw new RuntimeException("Invalid Genome Location string: " + str);
}
if ( stop == Integer.MAX_VALUE && hasKnownContigOrdering() ) {
// lookup the actually stop position!
stop = getContigInfo(contig).getSequenceLength();
}
GenomeLoc loc = new GenomeLoc(contig, start, stop);
//System.out.printf(" => Parsed location '%s' into %s%n", str, loc);
return loc;
}
/**
* Useful utility function that parses a location string into a coordinate-order sorted
* array of GenomeLoc objects
*
* @param str
* @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order
*/
public static ArrayList<GenomeLoc> parseGenomeLocs(final String str) {
// Of the form: loc1;loc2;...
// Where each locN can be:
// Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
StdReflect reflect = new JdkStdReflect();
FunctionN<GenomeLoc> parseOne = reflect.staticFunction(GenomeLoc.class, "parseGenomeLoc", String.class);
Function1<GenomeLoc, String> f1 = parseOne.f1();
try {
Collection<GenomeLoc> result = Functions.map(f1, Arrays.asList(str.split(";")));
ArrayList<GenomeLoc> locs = new ArrayList(result);
Collections.sort(locs);
//logger.info(String.format("Going to process %d locations", locs.length));
locs = mergeOverlappingLocations(locs);
logger.info(" Locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, locs)));
return locs;
} catch (Exception e) {
e.printStackTrace();
Utils.scareUser(String.format("Invalid locations string: %s, format is loc1;loc2; where each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'", str));
return null;
}
}
public static ArrayList<GenomeLoc> mergeOverlappingLocations(final ArrayList<GenomeLoc> raw) {
logger.info(" Raw locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, raw)));
if ( raw.size() <= 1 )
return raw;
else {
ArrayList<GenomeLoc> merged = new ArrayList<GenomeLoc>();
Iterator<GenomeLoc> it = raw.iterator();
GenomeLoc prev = it.next();
while ( it.hasNext() ) {
GenomeLoc curr = it.next();
if ( prev.contiguousP(curr) ) {
prev = prev.merge(curr);
} else {
merged.add(prev);
prev = curr;
}
}
merged.add(prev);
return merged;
}
}
/**
* Returns true iff we have a specified series of locations to process AND we are past the last
* location in the list. It means that, in a serial processing of the genome, that we are done.
*
* @param curr Current genome Location
* @return true if we are past the last location to process
*/
public static boolean pastFinalLocation(GenomeLoc curr, ArrayList<GenomeLoc> locs) {
if ( locs == null )
return false;
else {
GenomeLoc last = locs.get(locs.size() - 1);
return last.compareTo(curr) == -1 && ! last.overlapsP(curr);
}
}
/**
* A key function that returns true if the proposed GenomeLoc curr is within the list of
* locations we are processing in this TraversalEngine
*
* @param curr
* @return true if we should process GenomeLoc curr, otherwise false
*/
public static boolean inLocations(GenomeLoc curr, ArrayList<GenomeLoc> locs) {
if (locs == null) {
return true;
} else {
for ( GenomeLoc loc : locs ) {
//System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr));
if (loc.overlapsP(curr))
return true;
}
return false;
}
}
//
// Accessors and setters
//
@ -147,10 +320,34 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
return false;
}
public final boolean discontinuousP(GenomeLoc that) {
if ( compareContigs(this.contig, that.contig) != 0 ) return true; // different chromosomes
if ( (this.start - 1) > that.stop ) return true; // this guy is past that
if ( (that.start - 1) > this.stop ) return true; // that guy is past our start
return false;
}
public final boolean overlapsP(GenomeLoc that) {
return ! disjointP( that );
}
public final boolean contiguousP(GenomeLoc that) {
return ! discontinuousP( that );
}
public GenomeLoc merge( GenomeLoc that ) {
assert this.contiguousP(that);
return new GenomeLoc(getContig(),
Math.min(getStart(), that.getStart()),
Math.max( getStop(), that.getStop()) );
}
public final boolean containsP(GenomeLoc that) {
if ( ! onSameContig(that) ) return false;
return getStart() <= that.getStart() && getStop() >= that.getStop();
}
public final boolean onSameContig(GenomeLoc that) {
return this.contig.equals(that.contig);
}
@ -170,6 +367,10 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
return this.compareTo(left) > -1 && this.compareTo(right) < 1;
}
public final boolean isPast( GenomeLoc that ) {
return this.compareContigs(that) == 1 || this.getStart() > that.getStop();
}
public final void incPos() {
incPos(1);
}
@ -194,14 +395,17 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
return 0;
}
assert refContigOrdering.containsKey(thisContig);// : this;
assert refContigOrdering.containsKey(thatContig);// : that;
//assert getContigIndex(thisContig) != -1;// : this;
//assert getContigIndex(thatContig) != -1;// : that;
if ( refContigOrdering != null )
if ( hasKnownContigOrdering() )
{
if ( ! refContigOrdering.containsKey(thisContig) )
int thisIndex = getContigIndex(thisContig);
int thatIndex = getContigIndex(thatContig);
if ( thisIndex == -1 )
{
if ( ! refContigOrdering.containsKey(thatContig) )
if ( thatIndex == -1 )
{
// Use regular sorted order
return thisContig.compareTo(thatContig);
@ -212,20 +416,14 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
return 1;
}
}
else if ( ! refContigOrdering.containsKey(thatContig) )
else if ( thatIndex == -1 )
{
return -1;
}
else
{
assert refContigOrdering.containsKey(thisContig);// : this;
assert refContigOrdering.containsKey(thatContig);// : that;
final int thisO = refContigOrdering.get(thisContig);
final int thatO = refContigOrdering.get(thatContig);
if ( thisO < thatO ) return -1;
if ( thisO > thatO ) return 1;
if ( thisIndex < thatIndex ) return -1;
if ( thisIndex > thatIndex ) return 1;
return 0;
}
}

View File

@ -27,19 +27,19 @@ public class Utils {
private static Logger logger = Logger.getLogger(FileProgressTracker.class);
public static void warnUser(final String msg) {
logger.warn(String.format("********************************************************************************%n"));
logger.warn(String.format("* WARNING:%n"));
logger.warn(String.format("*%n"));
logger.warn(String.format("* %s%n", msg));
logger.warn(String.format("********************************************************************************%n"));
logger.warn(String.format("********************************************************************************"));
logger.warn(String.format("* WARNING:"));
logger.warn(String.format("*"));
logger.warn(String.format("* %s", msg));
logger.warn(String.format("********************************************************************************"));
}
public static void scareUser(final String msg) {
logger.fatal(String.format("********************************************************************************%n"));
logger.fatal(String.format("* ERROR:%n"));
logger.fatal(String.format("*%n"));
logger.fatal(String.format("* %s%n", msg));
logger.fatal(String.format("********************************************************************************%n"));
logger.fatal(String.format("********************************************************************************"));
logger.fatal(String.format("* ERROR:"));
logger.fatal(String.format("*"));
logger.fatal(String.format("* %s", msg));
logger.fatal(String.format("********************************************************************************"));
throw new RuntimeException(msg);
}
@ -232,30 +232,6 @@ public class Utils {
return averageDouble(vals, vals.size());
}
public static boolean setupRefContigOrdering(final ReferenceSequenceFile refFile) {
final SAMSequenceDictionary seqDict = refFile.getSequenceDictionary();
if (seqDict == null) // we couldn't load the reference dictionary
return false;
List<SAMSequenceRecord> refContigs = seqDict.getSequences();
HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
if (refContigs != null) {
int i = 0;
logger.info(String.format("Prepared reference sequence contig dictionary%n order ->"));
for (SAMSequenceRecord contig : refContigs) {
logger.info(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
refContigOrdering.put(contig.getSequenceName(), i);
i++;
}
logger.info(String.format("%n Total elements -> %d%n", refContigOrdering.size()));
}
GenomeLoc.setContigOrdering(refContigOrdering);
return refContigs != null;
}
// Java Generics can't do primitive types, so I had to do this the simplistic way
public static Integer[] SortPermutation(final int[] A) {