1. modifed by read traversals with indexes to be more general

2. GenomeLocs for reads should have ends spanning the read
   (moved it to GenomeLoc from Utils)
3. Got rid of those stupid unmappable characters from comments in various files


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@289 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-04-03 18:24:08 +00:00
parent 86fc18e9fc
commit 42eb356782
9 changed files with 52 additions and 38 deletions

View File

@ -60,6 +60,6 @@ public class ReadDatum implements Datum {
* @return a genome loc that details the region that our read spans.
*/
public GenomeLoc getSequenceLocation() {
return Utils.genomicLocationOf(sam);
return GenomeLoc.genomicLocationOf(sam);
}
}

View File

@ -66,7 +66,7 @@ public class ReadShard implements DataShard {
final List<SAMRecord> reads = Arrays.asList(read);
// put together the genome location
final GenomeLoc loc = Utils.genomicLocationOf(read);
final GenomeLoc loc = GenomeLoc.genomicLocationOf(read);
// Offset of a single read is always 0
List<Integer> offsets = Arrays.asList(0);

View File

@ -155,7 +155,7 @@ public class LocusIteratorByHanger extends LocusIterator {
return true;
else {
final SAMRecord read = it.peek();
GenomeLoc readLoc = Utils.genomicLocationOf(read);
GenomeLoc readLoc = GenomeLoc.genomicLocationOf(read);
final boolean coveredP = currentPositionIsFullyCovered(readLoc);
//System.out.printf("CoverP = %s => %b%n", readLoc, coveredP);
return coveredP;
@ -177,7 +177,7 @@ public class LocusIteratorByHanger extends LocusIterator {
SAMRecord read = it.next();
justCleared = false;
GenomeLoc readLoc = Utils.genomicLocationOf(read);
GenomeLoc readLoc = GenomeLoc.genomicLocationOf(read);
if ( DEBUG ) {
logger.debug(String.format(" Expanding window sizes %d with %d : left=%s, right=%s, readLoc = %s, cmp=%d%n",
readHanger.size(), incrementSize,

View File

@ -53,8 +53,8 @@ public class SortSamIterator implements Iterator<SAMRecord> {
}
public int compareTo(ComparableSAMRecord o) {
GenomeLoc myLoc = Utils.genomicLocationOf(record);
GenomeLoc hisLoc = Utils.genomicLocationOf(o.getRecord());
GenomeLoc myLoc = GenomeLoc.genomicLocationOf(record);
GenomeLoc hisLoc = GenomeLoc.genomicLocationOf(o.getRecord());
return myLoc.compareTo(hisLoc);
}
}

View File

@ -55,8 +55,8 @@ public class VerifyingSamIterator implements Iterator<SAMRecord> {
if ( last == null || cur.getReadUnmappedFlag() )
return false;
else {
GenomeLoc lastLoc = Utils.genomicLocationOf( last );
GenomeLoc curLoc = Utils.genomicLocationOf( cur );
GenomeLoc lastLoc = GenomeLoc.genomicLocationOf( last );
GenomeLoc curLoc = GenomeLoc.genomicLocationOf( cur );
return curLoc.compareTo(lastLoc) == -1;
}
}

View File

@ -188,7 +188,7 @@ public abstract class TraversalEngine {
* regions specified by the location string. The string is of the form:
* Of the form: loc1;loc2;...
* Where each locN can be:
* Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
* 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
*
* @param locStr
*/
@ -201,7 +201,7 @@ public abstract class TraversalEngine {
* regions specified by the location string. The string is of the form:
* Of the form: loc1;loc2;...
* Where each locN can be:
* Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
* 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
*
* @param file_name
*/

View File

@ -8,12 +8,12 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.FastaSequenceFile2;
import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.LinkedList;
import java.io.File;
import net.sf.samtools.SAMRecord;
@ -54,7 +54,8 @@ public class TraverseByReads extends TraversalEngine {
*/
public <M, T> Object traverseByRead(ReadWalker<M, T> walker, ArrayList<GenomeLoc> locations) {
samReadIter = initializeReads();
GenomeLoc.setupRefContigOrdering(new FastaSequenceFile2(refFileName));
if ( refFileName != null && !locations.isEmpty() )
GenomeLoc.setupRefContigOrdering(new FastaSequenceFile2(refFileName));
if (refFileName == null && !walker.requiresOrderedReads() && verifyingSamReadIter != null) {
logger.warn(String.format("STATUS: No reference file provided and unordered reads are tolerated, enabling out of order read processing."));
@ -72,13 +73,15 @@ public class TraverseByReads extends TraversalEngine {
List<Integer> offsets = Arrays.asList(0); // Offset of a single read is always 0
boolean done = false;
// copy the locations here in case we ever want to use the full list again later and so that we can remove efficiently
LinkedList notYetTraversedLocations = new LinkedList(locations);
while (samReadIter.hasNext() && !done) {
this.nRecords++;
// get the next read
final SAMRecord read = samReadIter.next();
final List<SAMRecord> reads = Arrays.asList(read);
GenomeLoc loc = Utils.genomicLocationOf(read);
GenomeLoc loc = GenomeLoc.genomicLocationOf(read);
// Jump forward in the reference to this locus location
LocusContext locus = new LocusContext(loc, reads, offsets);
@ -87,7 +90,8 @@ public class TraverseByReads extends TraversalEngine {
locus.setReferenceContig(refSite.getCurrentContig());
}
if (GenomeLoc.inLocations(loc, locations)) {
GenomeLoc.removePastLocs(loc, notYetTraversedLocations);
if (GenomeLoc.overlapswithSortedLocsP(loc, notYetTraversedLocations, locations.isEmpty())) {
//
// execute the walker contact

View File

@ -8,6 +8,7 @@ import net.sf.functionalj.Functions;
import net.sf.functionalj.util.Operators;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.SAMRecord;
import java.util.*;
import java.util.regex.Pattern;
@ -41,7 +42,6 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
//public static Map<String, Integer> refContigOrdering = null;
private static SAMSequenceDictionary contigInfo = null;
private static HashMap<String, String> interns = null;
private static int lastGoodIntervalIndex = 0;
public static boolean hasKnownContigOrdering() {
return contigInfo != null;
@ -129,6 +129,10 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
this( new String(toCopy.getContig()), toCopy.getStart(), toCopy.getStop() );
}
public static GenomeLoc genomicLocationOf(final SAMRecord read) {
return new GenomeLoc(read.getReferenceName(), read.getAlignmentStart(), read.getAlignmentEnd());
}
// --------------------------------------------------------------------------------------------------------------
//
// Parsing string representations
@ -140,7 +144,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
}
public static GenomeLoc parseGenomeLoc( final String str ) {
// Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
//System.out.printf("Parsing location '%s'%n", str);
final Pattern regex1 = Pattern.compile("([\\w&&[^:]]+)$"); // matches case 1
@ -204,7 +208,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
public static ArrayList<GenomeLoc> parseGenomeLocs(final String str) {
// Of the form: loc1;loc2;...
// Where each locN can be:
// Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
StdReflect reflect = new JdkStdReflect();
FunctionN<GenomeLoc> parseOne = reflect.staticFunction(GenomeLoc.class, "parseGenomeLoc", String.class);
Function1<GenomeLoc, String> f1 = parseOne.f1();
@ -272,29 +276,40 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
if ( locs.size() == 0 ) {
return true;
} else {
for ( int i = lastGoodIntervalIndex; i < locs.size(); i++ ) {
GenomeLoc loc = locs.get(i);
// since it's ordered, we can do some simple checks to save us tons of time
if ( hasKnownContigOrdering() ) {
int curIndex = getContigIndex(curr.contig);
int locIndex = getContigIndex(loc.contig);
// skip loci before intervals begin
if (curIndex < locIndex)
return false;
// skip loci between intervals
if (curIndex == locIndex && curr.stop < loc.start)
return false;
}
for ( GenomeLoc loc : locs ) {
//System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr));
if (loc.overlapsP(curr)) {
lastGoodIntervalIndex = i;
if (loc.overlapsP(curr))
return true;
}
}
return false;
}
}
public static void removePastLocs(GenomeLoc curr, List<GenomeLoc> locs) {
while ( !locs.isEmpty() && curr.isPast(locs.get(0)) ) {
//System.out.println("At: " + curr + ", removing: " + locs.get(0));
locs.remove(0);
}
}
public static boolean overlapswithSortedLocsP(GenomeLoc curr, List<GenomeLoc> locs, boolean returnTrueIfEmpty) {
if ( locs.isEmpty() )
return returnTrueIfEmpty;
// skip loci before intervals begin
if ( hasKnownContigOrdering() && getContigIndex(curr.contig) < getContigIndex(locs.get(0).contig) )
return false;
for ( GenomeLoc loc : locs ) {
//System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr));
if ( loc.overlapsP(curr) )
return true;
if ( curr.compareTo(loc) < 0 )
return false;
}
return false;
}
//
// Accessors and setters
//
@ -418,7 +433,6 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
int thisIndex = getContigIndex(thisContig);
int thatIndex = getContigIndex(thatContig);
if ( thisIndex == -1 )
{
if ( thatIndex == -1 )

View File

@ -180,10 +180,6 @@ public class Utils {
return new String(basesAsbytes);
}
public static GenomeLoc genomicLocationOf(final SAMRecord read) {
return new GenomeLoc(read.getReferenceName(), read.getAlignmentStart());
}
private static final Map<Integer, String> readFlagNames
= new HashMap<Integer, String>();