1. Modified by-read traversals with indexes to be more general
2. GenomeLocs for reads should have ends spanning the read (moved genomicLocationOf to GenomeLoc from Utils)
3. Got rid of those stupid unmappable characters from comments in various files

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@289 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
86fc18e9fc
commit
42eb356782
|
|
@ -60,6 +60,6 @@ public class ReadDatum implements Datum {
|
||||||
* @return a genome loc that details the region that our read spans.
|
* @return a genome loc that details the region that our read spans.
|
||||||
*/
|
*/
|
||||||
public GenomeLoc getSequenceLocation() {
|
public GenomeLoc getSequenceLocation() {
|
||||||
return Utils.genomicLocationOf(sam);
|
return GenomeLoc.genomicLocationOf(sam);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -66,7 +66,7 @@ public class ReadShard implements DataShard {
|
||||||
final List<SAMRecord> reads = Arrays.asList(read);
|
final List<SAMRecord> reads = Arrays.asList(read);
|
||||||
|
|
||||||
// put together the genome location
|
// put together the genome location
|
||||||
final GenomeLoc loc = Utils.genomicLocationOf(read);
|
final GenomeLoc loc = GenomeLoc.genomicLocationOf(read);
|
||||||
|
|
||||||
// Offset of a single read is always 0
|
// Offset of a single read is always 0
|
||||||
List<Integer> offsets = Arrays.asList(0);
|
List<Integer> offsets = Arrays.asList(0);
|
||||||
|
|
|
||||||
|
|
@ -155,7 +155,7 @@ public class LocusIteratorByHanger extends LocusIterator {
|
||||||
return true;
|
return true;
|
||||||
else {
|
else {
|
||||||
final SAMRecord read = it.peek();
|
final SAMRecord read = it.peek();
|
||||||
GenomeLoc readLoc = Utils.genomicLocationOf(read);
|
GenomeLoc readLoc = GenomeLoc.genomicLocationOf(read);
|
||||||
final boolean coveredP = currentPositionIsFullyCovered(readLoc);
|
final boolean coveredP = currentPositionIsFullyCovered(readLoc);
|
||||||
//System.out.printf("CoverP = %s => %b%n", readLoc, coveredP);
|
//System.out.printf("CoverP = %s => %b%n", readLoc, coveredP);
|
||||||
return coveredP;
|
return coveredP;
|
||||||
|
|
@ -177,7 +177,7 @@ public class LocusIteratorByHanger extends LocusIterator {
|
||||||
SAMRecord read = it.next();
|
SAMRecord read = it.next();
|
||||||
justCleared = false;
|
justCleared = false;
|
||||||
|
|
||||||
GenomeLoc readLoc = Utils.genomicLocationOf(read);
|
GenomeLoc readLoc = GenomeLoc.genomicLocationOf(read);
|
||||||
if ( DEBUG ) {
|
if ( DEBUG ) {
|
||||||
logger.debug(String.format(" Expanding window sizes %d with %d : left=%s, right=%s, readLoc = %s, cmp=%d%n",
|
logger.debug(String.format(" Expanding window sizes %d with %d : left=%s, right=%s, readLoc = %s, cmp=%d%n",
|
||||||
readHanger.size(), incrementSize,
|
readHanger.size(), incrementSize,
|
||||||
|
|
|
||||||
|
|
@ -53,8 +53,8 @@ public class SortSamIterator implements Iterator<SAMRecord> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public int compareTo(ComparableSAMRecord o) {
|
public int compareTo(ComparableSAMRecord o) {
|
||||||
GenomeLoc myLoc = Utils.genomicLocationOf(record);
|
GenomeLoc myLoc = GenomeLoc.genomicLocationOf(record);
|
||||||
GenomeLoc hisLoc = Utils.genomicLocationOf(o.getRecord());
|
GenomeLoc hisLoc = GenomeLoc.genomicLocationOf(o.getRecord());
|
||||||
return myLoc.compareTo(hisLoc);
|
return myLoc.compareTo(hisLoc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -55,8 +55,8 @@ public class VerifyingSamIterator implements Iterator<SAMRecord> {
|
||||||
if ( last == null || cur.getReadUnmappedFlag() )
|
if ( last == null || cur.getReadUnmappedFlag() )
|
||||||
return false;
|
return false;
|
||||||
else {
|
else {
|
||||||
GenomeLoc lastLoc = Utils.genomicLocationOf( last );
|
GenomeLoc lastLoc = GenomeLoc.genomicLocationOf( last );
|
||||||
GenomeLoc curLoc = Utils.genomicLocationOf( cur );
|
GenomeLoc curLoc = GenomeLoc.genomicLocationOf( cur );
|
||||||
return curLoc.compareTo(lastLoc) == -1;
|
return curLoc.compareTo(lastLoc) == -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -188,7 +188,7 @@ public abstract class TraversalEngine {
|
||||||
* regions specified by the location string. The string is of the form:
|
* regions specified by the location string. The string is of the form:
|
||||||
* Of the form: loc1;loc2;...
|
* Of the form: loc1;loc2;...
|
||||||
* Where each locN can be:
|
* Where each locN can be:
|
||||||
* Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
|
* 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||||
*
|
*
|
||||||
* @param locStr
|
* @param locStr
|
||||||
*/
|
*/
|
||||||
|
|
@ -201,7 +201,7 @@ public abstract class TraversalEngine {
|
||||||
* regions specified by the location string. The string is of the form:
|
* regions specified by the location string. The string is of the form:
|
||||||
* Of the form: loc1;loc2;...
|
* Of the form: loc1;loc2;...
|
||||||
* Where each locN can be:
|
* Where each locN can be:
|
||||||
* Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
|
* 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||||
*
|
*
|
||||||
* @param file_name
|
* @param file_name
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
|
|
@ -8,12 +8,12 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
|
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
|
||||||
import org.broadinstitute.sting.utils.FastaSequenceFile2;
|
import org.broadinstitute.sting.utils.FastaSequenceFile2;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.LinkedList;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
@ -54,7 +54,8 @@ public class TraverseByReads extends TraversalEngine {
|
||||||
*/
|
*/
|
||||||
public <M, T> Object traverseByRead(ReadWalker<M, T> walker, ArrayList<GenomeLoc> locations) {
|
public <M, T> Object traverseByRead(ReadWalker<M, T> walker, ArrayList<GenomeLoc> locations) {
|
||||||
samReadIter = initializeReads();
|
samReadIter = initializeReads();
|
||||||
GenomeLoc.setupRefContigOrdering(new FastaSequenceFile2(refFileName));
|
if ( refFileName != null && !locations.isEmpty() )
|
||||||
|
GenomeLoc.setupRefContigOrdering(new FastaSequenceFile2(refFileName));
|
||||||
|
|
||||||
if (refFileName == null && !walker.requiresOrderedReads() && verifyingSamReadIter != null) {
|
if (refFileName == null && !walker.requiresOrderedReads() && verifyingSamReadIter != null) {
|
||||||
logger.warn(String.format("STATUS: No reference file provided and unordered reads are tolerated, enabling out of order read processing."));
|
logger.warn(String.format("STATUS: No reference file provided and unordered reads are tolerated, enabling out of order read processing."));
|
||||||
|
|
@ -72,13 +73,15 @@ public class TraverseByReads extends TraversalEngine {
|
||||||
List<Integer> offsets = Arrays.asList(0); // Offset of a single read is always 0
|
List<Integer> offsets = Arrays.asList(0); // Offset of a single read is always 0
|
||||||
|
|
||||||
boolean done = false;
|
boolean done = false;
|
||||||
|
// copy the locations here in case we ever want to use the full list again later and so that we can remove efficiently
|
||||||
|
LinkedList notYetTraversedLocations = new LinkedList(locations);
|
||||||
while (samReadIter.hasNext() && !done) {
|
while (samReadIter.hasNext() && !done) {
|
||||||
this.nRecords++;
|
this.nRecords++;
|
||||||
|
|
||||||
// get the next read
|
// get the next read
|
||||||
final SAMRecord read = samReadIter.next();
|
final SAMRecord read = samReadIter.next();
|
||||||
final List<SAMRecord> reads = Arrays.asList(read);
|
final List<SAMRecord> reads = Arrays.asList(read);
|
||||||
GenomeLoc loc = Utils.genomicLocationOf(read);
|
GenomeLoc loc = GenomeLoc.genomicLocationOf(read);
|
||||||
|
|
||||||
// Jump forward in the reference to this locus location
|
// Jump forward in the reference to this locus location
|
||||||
LocusContext locus = new LocusContext(loc, reads, offsets);
|
LocusContext locus = new LocusContext(loc, reads, offsets);
|
||||||
|
|
@ -87,7 +90,8 @@ public class TraverseByReads extends TraversalEngine {
|
||||||
locus.setReferenceContig(refSite.getCurrentContig());
|
locus.setReferenceContig(refSite.getCurrentContig());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (GenomeLoc.inLocations(loc, locations)) {
|
GenomeLoc.removePastLocs(loc, notYetTraversedLocations);
|
||||||
|
if (GenomeLoc.overlapswithSortedLocsP(loc, notYetTraversedLocations, locations.isEmpty())) {
|
||||||
|
|
||||||
//
|
//
|
||||||
// execute the walker contact
|
// execute the walker contact
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ import net.sf.functionalj.Functions;
|
||||||
import net.sf.functionalj.util.Operators;
|
import net.sf.functionalj.util.Operators;
|
||||||
import net.sf.samtools.SAMSequenceDictionary;
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
import net.sf.samtools.SAMSequenceRecord;
|
import net.sf.samtools.SAMSequenceRecord;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
@ -41,7 +42,6 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
||||||
//public static Map<String, Integer> refContigOrdering = null;
|
//public static Map<String, Integer> refContigOrdering = null;
|
||||||
private static SAMSequenceDictionary contigInfo = null;
|
private static SAMSequenceDictionary contigInfo = null;
|
||||||
private static HashMap<String, String> interns = null;
|
private static HashMap<String, String> interns = null;
|
||||||
private static int lastGoodIntervalIndex = 0;
|
|
||||||
|
|
||||||
public static boolean hasKnownContigOrdering() {
|
public static boolean hasKnownContigOrdering() {
|
||||||
return contigInfo != null;
|
return contigInfo != null;
|
||||||
|
|
@ -129,6 +129,10 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
||||||
this( new String(toCopy.getContig()), toCopy.getStart(), toCopy.getStop() );
|
this( new String(toCopy.getContig()), toCopy.getStart(), toCopy.getStop() );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static GenomeLoc genomicLocationOf(final SAMRecord read) {
|
||||||
|
return new GenomeLoc(read.getReferenceName(), read.getAlignmentStart(), read.getAlignmentEnd());
|
||||||
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// Parsing string representations
|
// Parsing string representations
|
||||||
|
|
@ -140,7 +144,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static GenomeLoc parseGenomeLoc( final String str ) {
|
public static GenomeLoc parseGenomeLoc( final String str ) {
|
||||||
// Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
|
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||||
//System.out.printf("Parsing location '%s'%n", str);
|
//System.out.printf("Parsing location '%s'%n", str);
|
||||||
|
|
||||||
final Pattern regex1 = Pattern.compile("([\\w&&[^:]]+)$"); // matches case 1
|
final Pattern regex1 = Pattern.compile("([\\w&&[^:]]+)$"); // matches case 1
|
||||||
|
|
@ -204,7 +208,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
||||||
public static ArrayList<GenomeLoc> parseGenomeLocs(final String str) {
|
public static ArrayList<GenomeLoc> parseGenomeLocs(final String str) {
|
||||||
// Of the form: loc1;loc2;...
|
// Of the form: loc1;loc2;...
|
||||||
// Where each locN can be:
|
// Where each locN can be:
|
||||||
// Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ
|
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||||
StdReflect reflect = new JdkStdReflect();
|
StdReflect reflect = new JdkStdReflect();
|
||||||
FunctionN<GenomeLoc> parseOne = reflect.staticFunction(GenomeLoc.class, "parseGenomeLoc", String.class);
|
FunctionN<GenomeLoc> parseOne = reflect.staticFunction(GenomeLoc.class, "parseGenomeLoc", String.class);
|
||||||
Function1<GenomeLoc, String> f1 = parseOne.f1();
|
Function1<GenomeLoc, String> f1 = parseOne.f1();
|
||||||
|
|
@ -272,29 +276,40 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
||||||
if ( locs.size() == 0 ) {
|
if ( locs.size() == 0 ) {
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
for ( int i = lastGoodIntervalIndex; i < locs.size(); i++ ) {
|
for ( GenomeLoc loc : locs ) {
|
||||||
GenomeLoc loc = locs.get(i);
|
|
||||||
// since it's ordered, we can do some simple checks to save us tons of time
|
|
||||||
if ( hasKnownContigOrdering() ) {
|
|
||||||
int curIndex = getContigIndex(curr.contig);
|
|
||||||
int locIndex = getContigIndex(loc.contig);
|
|
||||||
// skip loci before intervals begin
|
|
||||||
if (curIndex < locIndex)
|
|
||||||
return false;
|
|
||||||
// skip loci between intervals
|
|
||||||
if (curIndex == locIndex && curr.stop < loc.start)
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
//System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr));
|
//System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr));
|
||||||
if (loc.overlapsP(curr)) {
|
if (loc.overlapsP(curr))
|
||||||
lastGoodIntervalIndex = i;
|
|
||||||
return true;
|
return true;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void removePastLocs(GenomeLoc curr, List<GenomeLoc> locs) {
|
||||||
|
while ( !locs.isEmpty() && curr.isPast(locs.get(0)) ) {
|
||||||
|
//System.out.println("At: " + curr + ", removing: " + locs.get(0));
|
||||||
|
locs.remove(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static boolean overlapswithSortedLocsP(GenomeLoc curr, List<GenomeLoc> locs, boolean returnTrueIfEmpty) {
|
||||||
|
if ( locs.isEmpty() )
|
||||||
|
return returnTrueIfEmpty;
|
||||||
|
|
||||||
|
// skip loci before intervals begin
|
||||||
|
if ( hasKnownContigOrdering() && getContigIndex(curr.contig) < getContigIndex(locs.get(0).contig) )
|
||||||
|
return false;
|
||||||
|
|
||||||
|
for ( GenomeLoc loc : locs ) {
|
||||||
|
//System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr));
|
||||||
|
if ( loc.overlapsP(curr) )
|
||||||
|
return true;
|
||||||
|
if ( curr.compareTo(loc) < 0 )
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// Accessors and setters
|
// Accessors and setters
|
||||||
//
|
//
|
||||||
|
|
@ -418,7 +433,6 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
||||||
int thisIndex = getContigIndex(thisContig);
|
int thisIndex = getContigIndex(thisContig);
|
||||||
int thatIndex = getContigIndex(thatContig);
|
int thatIndex = getContigIndex(thatContig);
|
||||||
|
|
||||||
|
|
||||||
if ( thisIndex == -1 )
|
if ( thisIndex == -1 )
|
||||||
{
|
{
|
||||||
if ( thatIndex == -1 )
|
if ( thatIndex == -1 )
|
||||||
|
|
|
||||||
|
|
@ -180,10 +180,6 @@ public class Utils {
|
||||||
return new String(basesAsbytes);
|
return new String(basesAsbytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static GenomeLoc genomicLocationOf(final SAMRecord read) {
|
|
||||||
return new GenomeLoc(read.getReferenceName(), read.getAlignmentStart());
|
|
||||||
}
|
|
||||||
|
|
||||||
private static final Map<Integer, String> readFlagNames
|
private static final Map<Integer, String> readFlagNames
|
||||||
= new HashMap<Integer, String>();
|
= new HashMap<Integer, String>();
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue