1. modified by read traversals with indexes to be more general
2. GenomeLocs for reads should have ends spanning the read (moved it to GenomeLoc from Utils) 3. Got rid of those stupid unmappable characters from comments in various files git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@289 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
86fc18e9fc
commit
42eb356782
|
|
@ -60,6 +60,6 @@ public class ReadDatum implements Datum {
|
|||
* @return a genome loc that details the region that our read spans.
|
||||
*/
|
||||
public GenomeLoc getSequenceLocation() {
|
||||
return Utils.genomicLocationOf(sam);
|
||||
return GenomeLoc.genomicLocationOf(sam);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -66,7 +66,7 @@ public class ReadShard implements DataShard {
|
|||
final List<SAMRecord> reads = Arrays.asList(read);
|
||||
|
||||
// put together the genome location
|
||||
final GenomeLoc loc = Utils.genomicLocationOf(read);
|
||||
final GenomeLoc loc = GenomeLoc.genomicLocationOf(read);
|
||||
|
||||
// Offset of a single read is always 0
|
||||
List<Integer> offsets = Arrays.asList(0);
|
||||
|
|
|
|||
|
|
@ -155,7 +155,7 @@ public class LocusIteratorByHanger extends LocusIterator {
|
|||
return true;
|
||||
else {
|
||||
final SAMRecord read = it.peek();
|
||||
GenomeLoc readLoc = Utils.genomicLocationOf(read);
|
||||
GenomeLoc readLoc = GenomeLoc.genomicLocationOf(read);
|
||||
final boolean coveredP = currentPositionIsFullyCovered(readLoc);
|
||||
//System.out.printf("CoverP = %s => %b%n", readLoc, coveredP);
|
||||
return coveredP;
|
||||
|
|
@ -177,7 +177,7 @@ public class LocusIteratorByHanger extends LocusIterator {
|
|||
SAMRecord read = it.next();
|
||||
justCleared = false;
|
||||
|
||||
GenomeLoc readLoc = Utils.genomicLocationOf(read);
|
||||
GenomeLoc readLoc = GenomeLoc.genomicLocationOf(read);
|
||||
if ( DEBUG ) {
|
||||
logger.debug(String.format(" Expanding window sizes %d with %d : left=%s, right=%s, readLoc = %s, cmp=%d%n",
|
||||
readHanger.size(), incrementSize,
|
||||
|
|
|
|||
|
|
@ -53,8 +53,8 @@ public class SortSamIterator implements Iterator<SAMRecord> {
|
|||
}
|
||||
|
||||
public int compareTo(ComparableSAMRecord o) {
|
||||
GenomeLoc myLoc = Utils.genomicLocationOf(record);
|
||||
GenomeLoc hisLoc = Utils.genomicLocationOf(o.getRecord());
|
||||
GenomeLoc myLoc = GenomeLoc.genomicLocationOf(record);
|
||||
GenomeLoc hisLoc = GenomeLoc.genomicLocationOf(o.getRecord());
|
||||
return myLoc.compareTo(hisLoc);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -55,8 +55,8 @@ public class VerifyingSamIterator implements Iterator<SAMRecord> {
|
|||
if ( last == null || cur.getReadUnmappedFlag() )
|
||||
return false;
|
||||
else {
|
||||
GenomeLoc lastLoc = Utils.genomicLocationOf( last );
|
||||
GenomeLoc curLoc = Utils.genomicLocationOf( cur );
|
||||
GenomeLoc lastLoc = GenomeLoc.genomicLocationOf( last );
|
||||
GenomeLoc curLoc = GenomeLoc.genomicLocationOf( cur );
|
||||
return curLoc.compareTo(lastLoc) == -1;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -188,7 +188,7 @@ public abstract class TraversalEngine {
|
|||
* regions specified by the location string. The string is of the form:
|
||||
* Of the form: loc1;loc2;...
|
||||
* Where each locN can be:
|
||||
* Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
|
||||
* 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||
*
|
||||
* @param locStr
|
||||
*/
|
||||
|
|
@ -201,7 +201,7 @@ public abstract class TraversalEngine {
|
|||
* regions specified by the location string. The string is of the form:
|
||||
* Of the form: loc1;loc2;...
|
||||
* Where each locN can be:
|
||||
* Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
|
||||
* 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||
*
|
||||
* @param file_name
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -8,12 +8,12 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
|||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.FastaSequenceFile2;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.io.File;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
|
@ -54,7 +54,8 @@ public class TraverseByReads extends TraversalEngine {
|
|||
*/
|
||||
public <M, T> Object traverseByRead(ReadWalker<M, T> walker, ArrayList<GenomeLoc> locations) {
|
||||
samReadIter = initializeReads();
|
||||
GenomeLoc.setupRefContigOrdering(new FastaSequenceFile2(refFileName));
|
||||
if ( refFileName != null && !locations.isEmpty() )
|
||||
GenomeLoc.setupRefContigOrdering(new FastaSequenceFile2(refFileName));
|
||||
|
||||
if (refFileName == null && !walker.requiresOrderedReads() && verifyingSamReadIter != null) {
|
||||
logger.warn(String.format("STATUS: No reference file provided and unordered reads are tolerated, enabling out of order read processing."));
|
||||
|
|
@ -72,13 +73,15 @@ public class TraverseByReads extends TraversalEngine {
|
|||
List<Integer> offsets = Arrays.asList(0); // Offset of a single read is always 0
|
||||
|
||||
boolean done = false;
|
||||
// copy the locations here in case we ever want to use the full list again later and so that we can remove efficiently
|
||||
LinkedList notYetTraversedLocations = new LinkedList(locations);
|
||||
while (samReadIter.hasNext() && !done) {
|
||||
this.nRecords++;
|
||||
|
||||
// get the next read
|
||||
final SAMRecord read = samReadIter.next();
|
||||
final List<SAMRecord> reads = Arrays.asList(read);
|
||||
GenomeLoc loc = Utils.genomicLocationOf(read);
|
||||
GenomeLoc loc = GenomeLoc.genomicLocationOf(read);
|
||||
|
||||
// Jump forward in the reference to this locus location
|
||||
LocusContext locus = new LocusContext(loc, reads, offsets);
|
||||
|
|
@ -87,7 +90,8 @@ public class TraverseByReads extends TraversalEngine {
|
|||
locus.setReferenceContig(refSite.getCurrentContig());
|
||||
}
|
||||
|
||||
if (GenomeLoc.inLocations(loc, locations)) {
|
||||
GenomeLoc.removePastLocs(loc, notYetTraversedLocations);
|
||||
if (GenomeLoc.overlapswithSortedLocsP(loc, notYetTraversedLocations, locations.isEmpty())) {
|
||||
|
||||
//
|
||||
// execute the walker contact
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ import net.sf.functionalj.Functions;
|
|||
import net.sf.functionalj.util.Operators;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
|
@ -41,7 +42,6 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
|||
//public static Map<String, Integer> refContigOrdering = null;
|
||||
private static SAMSequenceDictionary contigInfo = null;
|
||||
private static HashMap<String, String> interns = null;
|
||||
private static int lastGoodIntervalIndex = 0;
|
||||
|
||||
public static boolean hasKnownContigOrdering() {
|
||||
return contigInfo != null;
|
||||
|
|
@ -129,6 +129,10 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
|||
this( new String(toCopy.getContig()), toCopy.getStart(), toCopy.getStop() );
|
||||
}
|
||||
|
||||
/**
 * Builds a GenomeLoc for a read from its reference name, alignment start and alignment end.
 *
 * @param read the SAM record to locate
 * @return a new GenomeLoc constructed from read.getReferenceName(), read.getAlignmentStart()
 *         and read.getAlignmentEnd()
 */
public static GenomeLoc genomicLocationOf(final SAMRecord read) {
|
||||
return new GenomeLoc(read.getReferenceName(), read.getAlignmentStart(), read.getAlignmentEnd());
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Parsing string representations
|
||||
|
|
@ -140,7 +144,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
|||
}
|
||||
|
||||
public static GenomeLoc parseGenomeLoc( final String str ) {
|
||||
// Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
|
||||
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||
//System.out.printf("Parsing location '%s'%n", str);
|
||||
|
||||
final Pattern regex1 = Pattern.compile("([\\w&&[^:]]+)$"); // matches case 1
|
||||
|
|
@ -204,7 +208,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
|||
public static ArrayList<GenomeLoc> parseGenomeLocs(final String str) {
|
||||
// Of the form: loc1;loc2;...
|
||||
// Where each locN can be:
|
||||
// Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
|
||||
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||
StdReflect reflect = new JdkStdReflect();
|
||||
FunctionN<GenomeLoc> parseOne = reflect.staticFunction(GenomeLoc.class, "parseGenomeLoc", String.class);
|
||||
Function1<GenomeLoc, String> f1 = parseOne.f1();
|
||||
|
|
@ -272,29 +276,40 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
|||
if ( locs.size() == 0 ) {
|
||||
return true;
|
||||
} else {
|
||||
for ( int i = lastGoodIntervalIndex; i < locs.size(); i++ ) {
|
||||
GenomeLoc loc = locs.get(i);
|
||||
// since it's ordered, we can do some simple checks to save us tons of time
|
||||
if ( hasKnownContigOrdering() ) {
|
||||
int curIndex = getContigIndex(curr.contig);
|
||||
int locIndex = getContigIndex(loc.contig);
|
||||
// skip loci before intervals begin
|
||||
if (curIndex < locIndex)
|
||||
return false;
|
||||
// skip loci between intervals
|
||||
if (curIndex == locIndex && curr.stop < loc.start)
|
||||
return false;
|
||||
}
|
||||
for ( GenomeLoc loc : locs ) {
|
||||
//System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr));
|
||||
if (loc.overlapsP(curr)) {
|
||||
lastGoodIntervalIndex = i;
|
||||
if (loc.overlapsP(curr))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Removes entries from the front of locs for as long as curr.isPast(first entry) is true.
 * The list is modified in place; only its first element is ever examined, so anything
 * behind an entry that curr is not past is left untouched.
 * NOTE(review): presumably locs is expected to be sorted -- confirm against callers.
 *
 * @param curr the current location
 * @param locs the list to trim from the front; mutated by this call
 */
public static void removePastLocs(GenomeLoc curr, List<GenomeLoc> locs) {
|
||||
while ( !locs.isEmpty() && curr.isPast(locs.get(0)) ) {
|
||||
//System.out.println("At: " + curr + ", removing: " + locs.get(0));
|
||||
locs.remove(0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Tests whether curr overlaps an interval in locs, scanning from the front of the list
 * and giving up early where possible.
 *
 * @param curr the location to test
 * @param locs the intervals to test against; the early exits below assume this list is
 *             sorted (as the method name says) -- that is not checked here
 * @param returnTrueIfEmpty the value to return when locs is empty
 * @return returnTrueIfEmpty if locs is empty; otherwise true if an interval reached by the
 *         scan overlaps curr, and false if the scan stops early or runs out of intervals
 */
public static boolean overlapswithSortedLocsP(GenomeLoc curr, List<GenomeLoc> locs, boolean returnTrueIfEmpty) {
|
||||
if ( locs.isEmpty() )
|
||||
return returnTrueIfEmpty;
|
||||
|
||||
// skip loci before intervals begin
|
||||
// (only applied when a contig ordering is known; compares curr's contig index with the first interval's)
if ( hasKnownContigOrdering() && getContigIndex(curr.contig) < getContigIndex(locs.get(0).contig) )
|
||||
return false;
|
||||
|
||||
for ( GenomeLoc loc : locs ) {
|
||||
//System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr));
|
||||
if ( loc.overlapsP(curr) )
|
||||
return true;
|
||||
// curr compares below this interval and did not overlap it: stop scanning.
// Relies on locs being sorted, so that later intervals cannot overlap either -- TODO confirm.
if ( curr.compareTo(loc) < 0 )
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
//
|
||||
// Accessors and setters
|
||||
//
|
||||
|
|
@ -418,7 +433,6 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
|||
int thisIndex = getContigIndex(thisContig);
|
||||
int thatIndex = getContigIndex(thatContig);
|
||||
|
||||
|
||||
if ( thisIndex == -1 )
|
||||
{
|
||||
if ( thatIndex == -1 )
|
||||
|
|
|
|||
|
|
@ -180,10 +180,6 @@ public class Utils {
|
|||
return new String(basesAsbytes);
|
||||
}
|
||||
|
||||
public static GenomeLoc genomicLocationOf(final SAMRecord read) {
|
||||
return new GenomeLoc(read.getReferenceName(), read.getAlignmentStart());
|
||||
}
|
||||
|
||||
private static final Map<Integer, String> readFlagNames
|
||||
= new HashMap<Integer, String>();
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue