deleting accidentally committed junk

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4464 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2010-10-08 15:13:01 +00:00
parent b3d81984aa
commit 39e373af6e
14 changed files with 1940 additions and 5 deletions

View File

@ -88,7 +88,7 @@ public class VariantContextAdaptors {
return null;
Allele refAllele = Allele.create(DbSNPHelper.getReference(dbsnp), true);
if ( DbSNPHelper.isSNP(dbsnp) || DbSNPHelper.isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") ) {
if ( DbSNPHelper.isSNP(dbsnp) || DbSNPHelper.isIndel(dbsnp) || DbSNPHelper.isMNP(dbsnp) || dbsnp.getVariantType().contains("mixed") ) {
// add the reference allele
List<Allele> alleles = new ArrayList<Allele>();
alleles.add(refAllele);

View File

@ -45,6 +45,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
@ -84,6 +87,17 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
// @Argument(fullName="vcf_format", shortName="vcf", doc="generate output file in VCF format", required=false)
// boolean FORMAT_VCF = false;
@Argument(fullName = "genotype_intervals", shortName = "genotype",
doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false)
public String genotypeIntervalsFile = null;
@Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false,
doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
"if the list turns out to be unsorted, it will throw an exception. "+
"Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
"to sort and keep it in memory (increases memory usage!).")
protected boolean GENOTYPE_NOT_SORTED = false;
@Argument(fullName="somatic", shortName="somatic",
doc="Perform somatic calls; two input alignment files (-I option) must be specified. Calls are performed from the second file (\"tumor\") against the first one (\"normal\").", required=false)
boolean call_somatic = false;
@ -167,6 +181,11 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
private SAMRecord lastRead;
private byte[] refBases;
private ReferenceDataSource refData;
private Iterator<GenomeLoc> genotypeIntervals = null;
private GenomeLocSortedSet traverseIntervals = null; // these are the traversal intervals passed with -L option (if any)
// the current interval in the list of intervals, for which we want to do full genotyping
private GenomeLoc currentGenotypeInterval = null;
// "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt"
@ -272,6 +291,22 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
normalSamples.add(rg.getSample());
}
}
if ( genotypeIntervalsFile != null ) {
traverseIntervals = getToolkit().getIntervals();
if ( ! GENOTYPE_NOT_SORTED && IntervalUtils.isIntervalFile(genotypeIntervalsFile)) {
// prepare to read intervals one-by-one, as needed (assuming they are sorted).
genotypeIntervals = new IntervalFileMergingIterator(
new java.io.File(genotypeIntervalsFile), IntervalMergingRule.OVERLAPPING_ONLY );
} else {
// read in the whole list of intervals for cleaning
GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(
IntervalUtils.parseIntervalArguments(Arrays.asList(genotypeIntervalsFile)), IntervalMergingRule.OVERLAPPING_ONLY);
genotypeIntervals = locs.iterator();
}
currentGenotypeInterval = genotypeIntervals.hasNext() ? genotypeIntervals.next() : null;
}
}

View File

@ -0,0 +1,69 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.utils;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Aug 3, 2010
* Time: 4:10:42 PM
* To change this template use File | Settings | File Templates.
*/
/** A simple utility class that encapsulates information about a single alignment (offset, strand, overlap, mismatch count).
*
*/
public class AlignmentInfo {
    /** Sentinel mismatch count meaning "no alignment"; {@link #isAligned()} is true only below this value. */
    private static final int RIDICULOUSLY_LARGE_NUMBER = 1000000000;

    // Offset of the sequence on the assembly consensus, stored exactly as supplied. It is negative
    // when the sequence starts to the left of the consensus, so the sign must NOT be reused to
    // carry strand information.
    private final int offset;
    // True when the alignment was made with the reverse complement of the sequence.
    private final boolean negativeStrand;
    // Overlap value supplied by the caller; -1 for an empty (unaligned) record.
    private final int overlap;
    // Mismatch count supplied by the caller; RIDICULOUSLY_LARGE_NUMBER for an empty record.
    private final int mm;
    // Assembly this alignment refers to; null for an empty record.
    private final Assembly a;

    /** Creates an empty record: not aligned, offset 0, overlap -1, forward strand, no assembly. */
    public AlignmentInfo() {
        offset = 0;
        negativeStrand = false;
        mm = RIDICULOUSLY_LARGE_NUMBER;
        overlap = -1;
        a = null;
    }

    /**
     * Creates a record describing one alignment.
     *
     * @param mm      mismatch count of the alignment
     * @param offset  offset of the (possibly reverse-complemented) sequence on the consensus; may be negative
     * @param isRc    true if the reverse complement of the sequence was aligned
     * @param overlap overlap between the sequence and the consensus
     * @param a       assembly the sequence was aligned to
     */
    public AlignmentInfo(int mm, int offset, boolean isRc, int overlap, Assembly a) {
        // Strand is kept in its own field: folding it into the sign of the offset (as -offset-1)
        // made a forward alignment with a negative offset indistinguishable from a
        // reverse-complement alignment and reported the wrong offset for both.
        this.offset = offset;
        this.negativeStrand = isRc;
        this.overlap = overlap;
        this.mm = mm;
        this.a = a;
    }

    /** @return true if this record holds a real alignment (mismatch count below the sentinel) */
    boolean isAligned() { return mm < RIDICULOUSLY_LARGE_NUMBER; }

    /** @return true if the reverse complement of the sequence was aligned */
    public boolean isNegativeStrand() { return negativeStrand; }

    /** @return the assembly this alignment refers to, or null for an empty record */
    public Assembly getAssembly() { return a; }

    /** @return the offset exactly as passed to the constructor; negative if the sequence starts left of the consensus */
    public int getOffset() { return offset; }

    /** @return the mismatch count, or the large sentinel value if not aligned */
    public int getMismatchCount() { return mm; }

    /** @return the overlap, or -1 for an empty record */
    public int getOverlap() { return overlap; }

    /**
     * @return mismatches divided by overlap for an aligned record, 1.0 otherwise. No guard is applied
     *         for an overlap of 0, in which case the floating-point division yields Infinity or NaN.
     */
    public double getMismatchRate() { return isAligned() ? ((double)mm)/overlap : 1.0; }
}

View File

@ -0,0 +1,111 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.utils;
import org.broadinstitute.sting.utils.exceptions.StingException;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
/**
 * Collects the best alignments seen so far for one sequence, plus summary statistics (mismatch
 * count and hit count) for the next-best tier. Which tier a candidate alignment falls into is
 * decided by the {@link AlignmentStrategy} supplied at construction; only alignments placed in
 * the "best" tier are actually stored.
 */
public class AlignmentList implements Iterable<AlignmentInfo> {
    /**
     * Mismatch count used while a tier is still empty. It is the same numeric value that
     * AlignmentInfo's no-arg constructor uses for an unaligned record.
     */
    private static final int NO_ALIGNMENT_MM = 1000000000;

    private int best_mm = NO_ALIGNMENT_MM;       // lowest mismatch count among stored alignments
    private int next_best_mm = NO_ALIGNMENT_MM;  // mismatch count recorded for the next-best tier
    private int next_best_count = 0;             // number of hits recorded for the next-best tier
    private int best_overlap = 0;                // largest overlap among stored alignments
    private final List<AlignmentInfo> als = new ArrayList<AlignmentInfo>(1);
    private final AlignmentStrategy strategy;

    /**
     * Creates an empty list.
     *
     * @param s strategy consulted by {@link #tryAdd(AlignmentInfo)} for every candidate
     */
    public AlignmentList(AlignmentStrategy s) {
        this.strategy = s;
    }

    /** @return true once at least one alignment with a real mismatch count has become the best */
    public boolean isAligned() {
        return best_mm < NO_ALIGNMENT_MM;
    }

    /** @return the live internal list of best alignments (not a copy) */
    public List<AlignmentInfo> getAlignments() { return als; }

    /** @return number of stored (best-tier) alignments */
    public int size() { return als.size(); }

    /** @return iterator over the stored (best-tier) alignments */
    public Iterator<AlignmentInfo> iterator() { return als.iterator(); }

    /**
     * Offers an alignment to this list; the strategy decides what happens to it.
     *
     * @param ai candidate alignment
     * @throws StingException if the strategy returns an action this class does not handle
     */
    public void tryAdd(AlignmentInfo ai) {
        AlignmentStrategy.Action a = strategy.action(ai,this) ;
        switch ( a ) {
            case DISCARD:
                break;
            case REPLACE_BEST:
                // the current best tier is demoted to next-best before being dropped
                next_best_mm = best_mm;
                next_best_count = size();
                als.clear();
                als.add(ai);
                best_mm = ai.getMismatchCount();
                best_overlap = ai.getOverlap();
                break;
            case ADD_BEST:
                als.add(ai);
                if ( ai.getMismatchCount() < best_mm ) best_mm = ai.getMismatchCount();
                if ( ai.getOverlap() > best_overlap) best_overlap = ai.getOverlap();
                break;
            case REPLACE_NEXTBEST:
                // next-best alignments are only counted, never stored
                next_best_mm = ai.getMismatchCount();
                next_best_count = 1;
                break;
            case ADD_NEXTBEST:
                next_best_count++;
                if ( ai.getMismatchCount() < next_best_mm ) next_best_mm = ai.getMismatchCount();
                break;
            default:
                throw new StingException("Unrecognized action requested: "+a);
        }
    }

    /**
     * Offers every alignment stored in another list to this one, in iteration order.
     *
     * @param al list whose stored alignments are offered via {@link #tryAdd(AlignmentInfo)}
     */
    public void tryAddAll(AlignmentList al) {
        for( AlignmentInfo ai : al) {
            tryAdd(ai);
        }
    }

    /** @return lowest mismatch count among stored alignments, or the large sentinel if none */
    public int getBestMMCount() { return best_mm; }

    /** @return largest overlap among stored alignments (0 if none) */
    public int getBestOverlap() { return best_overlap; }

    /** @return number of stored (best-tier) alignments; same as {@link #size()} */
    public int getBestHitCount() { return als.size() ; }

    /** @return number of hits recorded for the next-best tier */
    public int getNextBestHitCount() { return next_best_count; }

    /** @return mismatch count recorded for the next-best tier, or the large sentinel if none */
    public int getNextBestMMCount() { return next_best_mm; }
}

View File

@ -0,0 +1,45 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.utils;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Aug 3, 2010
* Time: 2:53:43 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * Policy that tells an {@link AlignmentList} what to do with a candidate alignment.
 */
public interface AlignmentStrategy {

    /** Outcomes a strategy can request; the effects described are those applied by AlignmentList.tryAdd(). */
    enum Action {
        /** Ignore the candidate; the list is left unchanged. */
        DISCARD,
        /** Demote the current best tier to next-best and make the candidate the only best alignment. */
        REPLACE_BEST,
        /** Store the candidate alongside the current best alignments. */
        ADD_BEST,
        /** Reset the next-best tier to this candidate's mismatch count with a hit count of one. */
        REPLACE_NEXTBEST,
        /** Count the candidate as one more next-best hit. */
        ADD_NEXTBEST
    }

    /**
     * Decides how <code>alignment</code> should be handled given the current state of the list.
     *
     * @param alignment   candidate alignment
     * @param currentList list the candidate is being offered to
     * @return the action the list should take
     */
    Action action(AlignmentInfo alignment, AlignmentList currentList);
}

View File

@ -0,0 +1,442 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.utils;
import org.broadinstitute.sting.utils.collections.PrimitivePair;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.Utils;
import java.util.List;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import net.sf.samtools.util.StringUtil;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Aug 3, 2010
* Time: 2:20:22 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * A growing consensus sequence built from individual sequences (reads). The assembly keeps the
 * consensus bases, per-position coverage / mismatch / base counts, the sequences added so far with
 * their offsets on the consensus, and a kmer index of the consensus used to seed alignments.
 */
public class Assembly {
    private byte[] consensus;        // current consensus bases
    private short[] coverage;        // per consensus position: number of sequences covering it
    private short[] mismatches;      // per consensus position: number of bases disagreeing with the consensus
    private short [][] base_counts;  // [base index 0..3][consensus position]: count of each regular base
    private boolean debug = false;
    private List<String> seq_ids;        // ids of the sequences held, in insertion order
    private List<byte []> seqs;          // the sequences held (reverse-complemented where they were aligned that way)
    private List<Integer> seq_offsets;   // offset of each held sequence on the consensus
    private KmerIndex lookup; // assembled consensus sequence is indexed here
    private int hookedAt = -1; // if set, specifies start on the ref of the assembled consensus sequence
    private static final List<PrimitivePair.Int> EMPTY_KMER_LIST = new ArrayList<PrimitivePair.Int>(0);
    private int K = 15;
    private AlignmentStrategy strategy = null;

    /** Creates new assembly seeded with the specified sequence; default key length (15) is used.
     *
     * @param seq seed sequence
     * @param id  id of the seed sequence
     */
    public Assembly(final byte[] seq, String id) {
        this(15,seq,id);
    }

    /** Creates new assembly seeded with the specified sequence and sets kmer (key) length K for the internally maintained
     * lookup index tables.
     * @param K   kmer (key) length
     * @param seq seed sequence; it is copied into the consensus
     * @param id  id of the seed sequence
     */
    public Assembly(int K, final byte[] seq, String id) {
        this.K = K;
        seq_ids = new ArrayList<String>();
        seq_offsets = new ArrayList<Integer>();
        seqs = new ArrayList<byte[]>();
        seq_ids.add(id);
        seq_offsets.add(0);
        seqs.add(seq);
        consensus = Arrays.copyOf(seq,seq.length);
        coverage = new short[seq.length];
        Arrays.fill(coverage,(short)1);
        mismatches = new short[seq.length]; // filled with 0's
        base_counts = new short[4][seq.length];
        for ( int i = 0 ; i < seq.length ; i++ ) {
            int j = BaseUtils.simpleBaseToBaseIndex(seq[i]);
            if ( j != -1) base_counts[j][i] = 1; // non-regular bases (index -1) are not counted
        }
        lookup = new KmerIndex(K,seq);
        strategy = new DefaultAlignmentStrategy();
    }

    /** Creates new assembly seeded with the specified sequence; default key length (15) is used and the position on the
     * reference of the entire assembly is set (as assembly grows, position on the ref will be updated properly).
     *
     * @param seq      seed sequence
     * @param id       id of the seed sequence
     * @param posOnRef initial position of the assembly on the reference
     */
    public Assembly(final byte[] seq, String id, int posOnRef) {
        this(seq,id);
        hookedAt = posOnRef;
    }

    /** Creates new assembly seeded with the specified sequence and sets kmer (key) length K for the internally maintained
     * lookup index tables. Parameter <code>posOnRef</code> specifies the (initial) position of the entire assembly on the
     * ref; as the assembly grows, the position on ref will be updated properly.
     * @param K        kmer (key) length
     * @param seq      seed sequence
     * @param id       id of the seed sequence
     * @param posOnRef initial position of the assembly on the reference
     */
    public Assembly(int K, final byte[] seq, String id, int posOnRef) {
        this(K,seq,id);
        hookedAt = posOnRef;
    }

    /** Returns total number of sequences currently held by this assembly.
     *
     * @return number of sequences held
     */
    public int getNumSeqs() { return seqs.size() ; }

    /** Attempts to align <code>seq</code> to this assembly's consensus. Does NOT add
     * the sequence to the consensus even if it aligns! This method returns a list of alternative
     * best alignments found (according to the strategy used) in a newly allocated AlignmentList object.
     * @param seq sequence to align to this consensus
     * @param tryRC if true, will try aligning both seq and its reverse complement; otherwise
     * only forward alignment will be attempted (i.e. best placement of the seq, as it is provided,
     * along the assembled consensus sequence)
     * @return a newly allocated alignment list; returned list can be empty if no alignments are found
     */
    public AlignmentList align(final byte[] seq, boolean tryRC) {
        return align(seq,tryRC,null);
    }

    /** Attempts to align <code>seq</code> to this assembly's consensus. Does NOT add
     * the sequence to the consensus even if it aligns! This method uses existing list of alignments
     * (which can contain alignments to a different assembly) and updates it as necessary if a better alignment
     * (or multiple better alignments) than the one(s) already held in the list is found. Reference to the
     * <i>same</i> alignment list object is returned: this method modifies its argument. If alignment list argument
     * is <code>null</code>, new alignment list object will be allocated and returned by this method.
     *
     * @param seq sequence to align to this consensus
     * @param tryRC if true, will try aligning both seq and its reverse complement; otherwise
     * only forward alignment will be attempted (i.e. best placement of the seq, as it is provided,
     * along the assembled consensus sequence)
     * @param a existing alignment list to update in place, or <code>null</code> to have a new one allocated
     * @return <code>a</code> itself (updated) if it was non-null, otherwise a newly allocated list; the
     * returned list can be empty if no alignments are found
     */
    public AlignmentList align(final byte[] seq, boolean tryRC, AlignmentList a) {
        if ( debug ) System.out.println("Assembly:: aligning sequence of length "+seq.length+"; tryRC="+tryRC+"; K="+K);

        List<PrimitivePair.Int> fw_kmers = KmerIndex.toKeyOffsetList(K,seq);

        if ( debug ) {
            for( PrimitivePair.Int kmer: fw_kmers) {
                System.out.println("id="+kmer.getFirst()+" seq="+new String(KmerIndex.idToSeq(K,kmer.getFirst()))+" offset on seq="+kmer.getSecond());
            }
        }

        byte [] rc_seq = (tryRC ? BaseUtils.simpleReverseComplement(seq) : null );
        List<PrimitivePair.Int> rc_kmers = (tryRC ? KmerIndex.toKeyOffsetList(K,rc_seq) : EMPTY_KMER_LIST );

        if ( a == null ) a = new AlignmentList(strategy);

        // every kmer of seq that also occurs on the consensus suggests one placement of seq
        for(PrimitivePair.Int kmer : fw_kmers ) {
            List<Integer> offsets = lookup.getOffsets(kmer.first);
            if ( offsets != null ) {
                // kmer present in consensus sequence
                for ( int s : offsets ) { // s=offset of the current kmer on the assembled consensus
                    int trial_offset = s - kmer.second; // offset of the seq on the assembled consensus suggested by current kmer/offset
                    // counting stops early once the placement is worse than the current next-best
                    int trial_mm = countMismatches(seq,trial_offset,a.getNextBestMMCount());
                    a.tryAdd(new AlignmentInfo(trial_mm,trial_offset,false,overlap(trial_offset,seq.length),this));
                }
            }
        }

        // same for the reverse complement (rc_kmers is empty when tryRC is false)
        for ( PrimitivePair.Int kmer : rc_kmers ) {
            List<Integer> offsets = lookup.getOffsets(kmer.first);
            if ( offsets != null ) {
                // kmer present in consensus sequence
                for ( int s : offsets ) {
                    int trial_offset = s - kmer.second;
                    int trial_mm = countMismatches(rc_seq,trial_offset,a.getNextBestMMCount());
                    a.tryAdd(new AlignmentInfo(trial_mm,trial_offset,true,overlap(trial_offset,seq.length),this));
                }
            }
        }
        return a;
    }

    /** Turns debug output on or off for this assembly and for its kmer index. */
    public void setDebug(boolean d) { this.debug = d; lookup.setDebug(d);}

    /** @return number of sequence ids held by this assembly */
    public int numSequences() { return seq_ids.size(); }

    /** Length of the intersection between the consensus and a sequence of length seq_length placed at offset. */
    private int overlap(int offset, int seq_length ) {
        return Math.min(consensus.length,offset+seq_length)-Math.max(0,offset);
    }

    /** Counts mismatches between seq placed at offset and the consensus over their overlapping part;
     * counting stops as soon as the count exceeds cutoff (so the result is at most cutoff+1).
     */
    private int countMismatches(final byte seq[], int offset, int cutoff) {
        int mm = 0;

        int i ;
        if ( offset >= 0 ) i = 0;
        else { i = (-offset); offset = 0; } // seq starts left of the consensus: skip the overhanging part

        for ( ; i < seq.length && offset < consensus.length ; i++ , offset++ ) {
            if ( seq[i] != consensus[offset] ) {
                mm++;
                if ( mm > cutoff ) break;
            }
        }

        return mm;
    }

    /** @return the internal consensus array (not a copy) */
    public byte[] getConsensus() { return consensus; }

    /** @return position of the assembly on the reference, or -1 if it was never set */
    public int getPosOnRef() { return hookedAt; }

    /** @return current length of the consensus */
    public int getConsensusLength() { return consensus.length; }

    /** @return the internal list of sequence offsets on the consensus (not a copy) */
    public List<Integer> getOffsets() { return seq_offsets; }

    /** @return offset on the consensus of the i-th sequence held */
    public int getOffset(int i) { return seq_offsets.get(i); }

    /** @return unmodifiable view of the ids of the sequences held */
    public List<String> getSeqIds() { return Collections.unmodifiableList(seq_ids); }

    /** Adds specified sequence to this assembly according to the provided
     * alignment information. Will properly update consensus sequence of this assembly
     * and all associated information (mismatches, base counts etc)
     * @param seq sequence to add, as originally provided (it is reverse-complemented here if the alignment is on the negative strand)
     * @param id  id of the sequence
     * @param a   alignment of the sequence to this assembly
     * @throws StingException if <code>a</code> is not an actual alignment
     */
    public void add(final byte[] seq, String id, AlignmentInfo a) {

        if ( ! a.isAligned() ) throw new StingException("Can not add sequence to the assembly: provided alignment is empty");

        seq_ids.add(id);

        int offset = a.getOffset();
        int oldConsensusLength = consensus.length;

        byte [] seq_to_add = ( a.isNegativeStrand() ? BaseUtils.simpleReverseComplement(seq) : seq);

        seqs.add(seq_to_add);

        int pos_on_seq = 0;
        int pos_on_cons = 0;

        int leftExtension = 0; // how many bases we added to the consensus on the left
        int rightExtension = 0; // how many bases we added to the consensus on the right

        if ( offset < 0 ) {
            // if sequence sticks out to the left of the current consensus:

            leftExtension = -offset;
            for(int i = 0 ; i < seq_offsets.size() ; i++ ) {
                // we are going to extend consensus to the left, so we need to update all current offsets:
                seq_offsets.set(i,seq_offsets.get(i)+leftExtension);
            }

            if ( hookedAt > 0 ) hookedAt -= leftExtension;
            // extend consensus and associated arrays to the left :

            consensus = Utils.extend(consensus,offset,(byte)0); // remember, offset is negative here, extending to the left
            coverage = Utils.extend(coverage,offset,(short)1) ;
            mismatches = Utils.extend(mismatches,offset,(short)0);
            for ( int i = 0 ; i < 4 ; i++ ) base_counts[i] = Utils.extend(base_counts[i],offset,(short)0);

            for ( int j = 0 ; j < -offset ; j++ ) {
                consensus[j] = seq_to_add[j];
                int b = BaseUtils.simpleBaseToBaseIndex(seq_to_add[j]);
                if ( b != -1 ) base_counts[b][j]=1;
            }

            pos_on_seq = pos_on_cons = -offset;

            offset = 0;
        }
        if ( offset > 0 ) pos_on_cons = offset;

        seq_offsets.add(offset);

        boolean consensus_changed = false;

        // walk over the part of the sequence that overlaps the existing consensus
        for ( ; pos_on_seq < seq_to_add.length && pos_on_cons < consensus.length ; pos_on_seq++, pos_on_cons++ ) {
            coverage[pos_on_cons]++;
            final byte base = seq_to_add[pos_on_seq];
            final int b = BaseUtils.simpleBaseToBaseIndex(base);
            if ( b != -1 ) {
                // if base on seq is not a regular base, there is nothing to do;
                // otherwise count mismatches and optionally update consensus if current base tips the balance
                base_counts[b][pos_on_cons]++;
                int maxcount = 0;
                int maxb = -1;
                for ( int j = 0 ; j < 4 ; j++ ) {
                    if ( base_counts[j][pos_on_cons] > maxcount ) {
                        maxcount = base_counts[j][pos_on_cons];
                        maxb = j;
                    }
                }
                // we are guaranteed here that maxb != -1 since we just added one regular base (the current one)
                // few lines above...
                byte newbase = BaseUtils.baseIndexToSimpleBase(maxb);
                if ( newbase != consensus[pos_on_cons] ) { // need to change the consensus base (will recompute mismatches)
                    consensus[pos_on_cons] = newbase;
                    consensus_changed = true;
                    mismatches[pos_on_cons] = 0;
                    for ( int i = 0 ; i < 4 ; i++ ) {
                        if ( i == maxb ) continue;
                        mismatches[pos_on_cons] += base_counts[i][pos_on_cons];
                    }
                } else { // consensus base did not change; just increment mismatches if current sequence's base differs from consensus
                    if ( base != consensus[pos_on_cons]) mismatches[pos_on_cons]++;
                }
            }

        }

        // Last step: if sequence sticks out of current consensus on the right, we need to extend the latter:

        if ( pos_on_seq < seq_to_add.length ) {
            // sequence sticks out of consensus to the right
            rightExtension = seq_to_add.length - pos_on_seq;
            consensus = Utils.extend(consensus,rightExtension,(byte)0);
            coverage = Utils.extend(coverage,rightExtension,(short)1);
            mismatches = Utils.extend(mismatches,rightExtension,(short)0);
            for ( int i = 0 ; i < 4 ; i++ ) base_counts[i] = Utils.extend(base_counts[i],rightExtension,(short)0);
            for ( ; pos_on_seq < seq_to_add.length ; pos_on_seq++, pos_on_cons++ ) {
                byte base = seq_to_add[pos_on_seq];
                consensus[pos_on_cons] = base;
                int b = BaseUtils.simpleBaseToBaseIndex(base);
                // a newly added position has been seen exactly once: record a COUNT of 1
                // (storing the base byte itself here would corrupt later consensus votes)
                if ( b != -1 ) base_counts[b][pos_on_cons] = 1;
            }
        }

        // there is probably a better way, but for now we just recompute the whole lookup table when consensus
        // changes somewhere in the middle (if we want to be smart we need to identify just the kmers that changed
        // and find/change them in the lookup table).
        if ( consensus_changed ) {
            lookup.clear();
            lookup.index(consensus);
        } else {
            if ( leftExtension > 0 || rightExtension > 0 ) lookup.updateIndex(consensus,leftExtension,oldConsensusLength);
        }

    }

    /** Renders the assembly as text: the consensus on the first line, then one line per sequence held,
     * each shifted right by its offset on the consensus.
     *
     * @param mismatchesOnly if true, bases that agree with the consensus are printed as '.'
     * @param printNames     if true, each sequence line starts with the sequence id, padded so that
     *                       all sequences line up (the consensus line is indented by the same width)
     * @return the multi-line alignment string
     */
    public String toAlignmentString(boolean mismatchesOnly, boolean printNames) {

        int maxNameLength = 0;
        int spacing=3;

        if ( printNames ) {
            // padding width is driven by the longest id
            for ( String n : seq_ids ) maxNameLength = Math.max(maxNameLength, n.length());
        }

        StringBuilder b = new StringBuilder();
        if ( printNames ) b.append(Utils.dupString(' ',maxNameLength+spacing));
        b.append(new String(consensus));
        b.append('\n');

        for ( int j = 0; j < seqs.size() ; j++ ) {
            int offset = seq_offsets.get(j);
            byte [] seq = seqs.get(j);

            if ( printNames ) {
                b.append(seq_ids.get(j));
                b.append(Utils.dupString(' ',maxNameLength-seq_ids.get(j).length()+spacing));
            }

            for ( int i = 0 ; i < offset ; i++ ) b.append(' ');

            for ( int i = 0 ; i < seq.length ; i++ ) {

                byte base = seq[i];
                if ( mismatchesOnly && base == consensus[i+offset] ) {
                    b.append('.');
                } else b.append((char)base);
            }
            b.append('\n');
        }
        return b.toString();
    }

    /** Ad-hoc manual check: assembles four hard-coded sequences and prints the resulting alignment to stdout.
     *
     * @param argv ignored
     */
    public static void testMe(String [] argv ) {
        byte [] seq1 = "ACGTTGCGTGGTTCACTGCAGTAACTGACTGATGCA".getBytes();
        byte [] seq2 = "GCGTGGTTTACTGCAGTAACTGACTGATGCAACGTGTTTG".getBytes();
        byte [] seq3 = "GGNTGACGTTGCGTGGTTTACTGCAGTAACTGACT".getBytes();
        byte [] seq4 = "NNNTTNCGTGGTTTACTGCAGTAACTGACTGATGCA".getBytes();

        Assembly a = new Assembly(seq1,"1");

        AlignmentList al = a.align(seq2,false);
        if ( al.isAligned() ) System.out.println("seq 2 aligned");
        else System.out.println("seq 2 did NOT align");

        if ( al.size() == 1 ) a.add(seq2,"2",al.getAlignments().get(0));
        else System.out.println("Multiple alignments found for seq 2");

        al = a.align(seq3,false);
        if ( al.isAligned() ) System.out.println("seq 3 aligned");
        else System.out.println("seq 3 did NOT align");

        if ( al.size() == 1 ) a.add(seq3,"3",al.getAlignments().get(0));
        else System.out.println("Multiple alignments found for seq 3");

        al = a.align(seq4,false);
        if ( al.isAligned() ) System.out.println("seq 4 aligned");
        else System.out.println("seq 4 did NOT align");

        if ( al.size() == 1 ) a.add(seq4,"4",al.getAlignments().get(0));
        else System.out.println("Multiple alignments found for seq 4");

        System.out.println(a.toAlignmentString(true, true));
    }
}

View File

@ -0,0 +1,67 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.utils;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.exceptions.StingException;
import java.util.List;
import java.util.LinkedList;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Sep 13, 2010
* Time: 5:53:33 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * Holds a list of source assemblies. Only construction is implemented so far;
 * {@link #add(SAMRecord)} is an empty placeholder.
 */
public class AssemblyGraph {

    private List<Assembly> sources;

    /**
     * Creates a graph whose only source is the given assembly.
     *
     * @param a assembly to register as the single source
     */
    public AssemblyGraph(Assembly a) {
        final List<Assembly> initial = new LinkedList<Assembly>();
        initial.add(a);
        sources = initial;
    }

    /** Initializes assembly from the single specified read, and sets this assembly as the root of this
     * assembly graph
     * @param r read; must be aligned, otherwise exception will be thrown
     * @param K index (Kmer) size of the assembly that will be initialized with the read r
     */
    public AssemblyGraph(SAMRecord r, int K) {
        this(seedAssembly(r, K));
    }

    /** Builds the seed assembly from a read's bases, name and alignment start; rejects unmapped reads. */
    private static Assembly seedAssembly(SAMRecord read, int kmerSize) {
        if ( AlignmentUtils.isReadUnmapped(read) ) {
            throw new StingException("Can not initialize assembly graph with unaligned read");
        }
        return new Assembly(kmerSize, read.getReadBases(), read.getReadName(), read.getAlignmentStart());
    }

    /**
     * Placeholder: currently does nothing with the read.
     *
     * @param r read to add (ignored)
     */
    public void add(SAMRecord r) {
    }
}

View File

@ -0,0 +1,52 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.utils;
import org.broadinstitute.sting.utils.exceptions.StingException;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Aug 3, 2010
* Time: 4:53:01 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * Strategy that ranks alignments purely by mismatch count: fewer mismatches is better, ties are
 * kept, and anything worse than the current next-best tier is thrown away.
 */
public class DefaultAlignmentStrategy implements AlignmentStrategy {

    /**
     * Classifies a candidate against the best and next-best mismatch counts of the list.
     *
     * @param alignment   candidate alignment
     * @param currentList list the candidate is being offered to
     * @return REPLACE_BEST for an empty list or a strictly better candidate, ADD_BEST for a tie with
     *         the best, REPLACE_NEXTBEST / ADD_NEXTBEST for a candidate strictly better than / equal
     *         to the next-best, DISCARD for anything worse than the next-best
     */
    public Action action(AlignmentInfo alignment, AlignmentList currentList) {
        // nothing stored yet: the first candidate always becomes the best
        if ( currentList.size() == 0 ) {
            return Action.REPLACE_BEST;
        }

        final int candidateMM = alignment.getMismatchCount();
        final int nextBestMM = currentList.getNextBestMMCount();
        if ( candidateMM > nextBestMM ) {
            return Action.DISCARD;
        }

        final int bestMM = currentList.getBestMMCount();
        if ( candidateMM < bestMM ) {
            return Action.REPLACE_BEST;
        }
        if ( candidateMM == bestMM ) {
            return Action.ADD_BEST;
        }

        // here candidateMM <= nextBestMM, so the two remaining cases are exhaustive
        return ( candidateMM < nextBestMM ) ? Action.REPLACE_NEXTBEST : Action.ADD_NEXTBEST;
    }
}

View File

@ -0,0 +1,332 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.utils;
import org.broadinstitute.sting.utils.collections.PrimitivePair;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.BaseUtils;
import java.util.List;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Aug 3, 2010
* Time: 1:31:15 PM
* To change this template use File | Settings | File Templates.
*/
public class KmerIndex {
    /** Lookup table: integer kmer id -> all offsets at which that kmer occurs on the indexed sequence. */
    private HashMap<Integer,List<Integer> > lookup;
    /** Key (kmer) length used by this index instance. */
    private int K = -1;
    /** Bit mask with the lowest 2*K bits set; computed in the constructor. */
    private int mask = 0;
    private boolean debug = false;

    /**
     * Translates sequence <code>seq</code> into the list of all valid kmers paired
     * with their offsets on that sequence. A valid kmer is a kmer that contains only ACGT bases; if <code>seq</code>
     * contains any other symbols, no kmers overlapping with such symbols will be generated. This method
     * returns a linear (possibly gapped if non-ACGT symbols are present) representation of the sequence as its kmers
     * with corresponding offsets, NOT a lookup index. If a specific kmer occurs on the sequence N times,
     * the returned list will have N occurrences of this kmer, each paired with one unique location on the sequence.
     * Kmers themselves are represented as integer kmer ids here, see #idToSeq() if string (ACGT bases) representation
     * of kmers is needed. An empty list is returned if no valid kmers are found on the sequence
     * (i.e. too many non-ACGT bases).
     *
     * @param K key (kmer) length
     * @param seq sequence to translate into kmer/offset representation
     * @return list of kmer/offsets
     */
    public static List<PrimitivePair.Int> toKeyOffsetList(int K, byte seq[]) {
        return toKeyOffsetList(K,seq,0,seq.length);
    }

    /** Same as #toKeyOffsetList(int K, byte [] seq) (see docs), except that this method is not static and
     * uses key length K associated with the specific instance of the KmerIndex class.
     * @param seq sequence to translate into kmer/offset representation
     * @return list of kmer/offsets
     */
    public List<PrimitivePair.Int> toKeyOffsetList(byte [] seq) {
        return toKeyOffsetList(this.K,seq);
    }

    /** Returns an ordered sequence of overlapping (1-shift in the ideal case) k-mers of length K found in the subsequence of
     * length <code>length</code> of sequence <code>seq</code> starting at position <code>start</code>. All returned kmers
     * are fully subsumed by the interval [start, start+length) on the sequence <code>seq</code> (no partial overlaps).
     * Each kmer is paired with its offset on the (full length) <code>seq</code> in the returned list.
     * Note that only k-mers on the forward strand are returned. You need to manually rc the string and
     * call toKeyOffsetList() again to get rc k-mers. If sequence contains any other symbols than ACGT, all k-mers
     * that would overlap with those symbols will be skipped (not present in the returned list). See also
     * #toKeyOffsetList(int K, byte [] seq) which translates the whole sequence <code>seq</code>.
     *
     * @param K index key (k-mer) length
     * @param seq sequence to compute k-mers from
     * @param start compute kmers for subsequence [start, start+length) of <code>seq</code>
     * @param length compute kmers for subsequence [start, start+length) of <code>seq</code>
     * @return a list of pairs (kmer,offset_on_the_seq) for each valid kmer (i.e. kmer that does not overlap with
     * non-ACGT bases); if no valid kmers exist, the returned list will be empty.
     * @throws StingException if <code>length</code> is less than <code>K</code>
     */
    public static List<PrimitivePair.Int> toKeyOffsetList(int K, byte[] seq, int start, int length) {
        if ( length < K ) throw new StingException("Can not index sequence that is shorter than key length: total seq="+seq.length+"; start="+start+"; length="+length);

        // mask keeps only the lowest 2*K bits, i.e. drops the base that slides out of the kmer window
        int mask = 0;
        for ( int i = 0; i < K ; i++ ) {
            mask <<= 2;
            mask |= 0x03;
        }

        final int final_pos = start+length; // first base *after* the last position we want to index
        ArrayList<PrimitivePair.Int> l = new ArrayList<PrimitivePair.Int>(length-K+1);

        PrimitivePair.Int firstK = toFirstKey(K,seq,start,final_pos);
        if ( firstK == null ) {
            // too many non-ACGT bases, we were not able to find a single valid k-mer on the whole sequence
            return l;
        }
        l.add(firstK);

        int key = firstK.getFirst();
        int offset = firstK.getSecond() + 1; // offset of the kmer that will be completed by the next base
        int next = firstK.getSecond() + K;   // the next base to consume: first base after the last kmer found

        // Recompute subsequent kmers efficiently: reuse the previous kmer and add only the new last base.
        // This breaks when we encounter a non-ACGT base, in which case we have to start over.
        while ( next < final_pos ) {
            int d = BaseUtils.simpleBaseToBaseIndex(seq[next]);
            if ( d == -1 ) {
                // we ran into a bad base; jump over it completely and reinitialize the key
                // (since all kmers overlapping with the current base are invalid)
                firstK = toFirstKey(K,seq,next+1,final_pos);
                if ( firstK == null ) break; // no more valid kmers
                l.add(firstK);
                key = firstK.getFirst();            // reset key to the new kmer we just found
                offset = firstK.getSecond() + 1;
                // Resume with the base immediately following the kmer we just found. (The previous
                // implementation advanced one base too far here, so the base right after the restart
                // kmer never made it into the key.)
                next = firstK.getSecond() + K;
            } else {
                // the base is good, so we can compute our new kmer very efficiently using the old one:
                key <<= 2;
                key &= mask;
                key += d;
                l.add(new PrimitivePair.Int(key,offset));
                offset++;
                next++;
            }
        }
        return l;
    }

    /** Non-static version of #toKeyOffsetList(int K, byte [] seq, int start, int length) (see docs), which
     * uses key length K associated with this instance of the KmerIndex object.
     * @param seq sequence to compute k-mers from
     * @param start first position of the subsequence to process
     * @param length length of the subsequence to process
     * @return list of (kmer, offset) pairs
     */
    public List<PrimitivePair.Int> toKeyOffsetList(byte[] seq, int start, int length) {
        return toKeyOffsetList(this.K,seq,start,length);
    }

    /** Computes index (key) of the first valid kmer in the interval [start,stop) of the sequence seq. Kmer is valid
     * if it contains only valid (ACGT) bases. Returns key and actual offset of first such kmer found, or null
     * if such kmer does not exist (i.e. if the interval does not contain a continuous span of ACGT bases
     * at least K bases long).
     * @param K kmer length
     * @param seq sequence to scan
     * @param start first position to consider (inclusive)
     * @param stop end of the interval (exclusive)
     * @return (key, offset) of the first valid kmer, or null if there is none
     */
    private static PrimitivePair.Int toFirstKey(int K, byte[] seq, int start, int stop) {
        int d = -1;
        int key = 0 ;
        // keep trying while the last attempt failed and a full kmer still fits into [start,stop)
        while ( d == -1 && start < stop - K + 1) {
            key = 0;
            for ( int i = start ; i < start+K; i++ ) {
                key <<= 2;
                d = BaseUtils.simpleBaseToBaseIndex(seq[i]);
                if ( d == -1) {
                    // non-ACGT base found, abort and start over. The next kmer that
                    // has a chance to be valid (contain only ACGT bases) can start only after the current position:
                    start = i+1;
                    break;
                }
                key += d;
            }
        }
        if ( d != -1 ) return new PrimitivePair.Int(key,start);
        else return null;
    }

    /** Creates an empty kmer index table with specified key length
     *
     * @param K key length; must be odd and not greater than 16
     * @throws StingException if K is greater than 16 or even
     */
    public KmerIndex(final int K) {
        if ( K > 16 ) throw new StingException("Lookup keys longer than 16 bases are currently not supported");
        if ( K % 2 == 0 ) throw new StingException("Even keys require additional processing of palindromes, currently not supported. Please use odd key.");
        this.K = K;
        mask = 0;
        for ( int i = 0; i < K; i++ ) {
            mask <<= 2;
            mask |= 0x03;
        }
        lookup = new HashMap<Integer,List<Integer>>();
    }

    /** Builds kmer index table with key length <code>K</code> for the sequence <code>seq</code>.
     *
     * @param K key length
     * @param seq sequence to index
     * @throws StingException if the sequence is shorter than K (or K itself is not supported)
     */
    public KmerIndex(final int K, final byte[] seq) {
        this(K);
        if ( seq.length < K ) throw new StingException("Sequence is shorter than requested lookup index key length");
        addToIndex(toKeyOffsetList(K,seq,0,seq.length));
    }

    public void setDebug(boolean d) { this.debug = d; }

    /** Clears current lookup index table completely (but preserves the key length previously set).
     *
     */
    public void clear() { lookup.clear(); }

    /** Builds complete index for the sequence seq. This method can be used only when lookup table is
     * empty (i.e. use #clear() first), otherwise an exception will be thrown.
     * @param seq sequence to index
     */
    public void index(final byte[] seq) {
        if ( ! lookup.isEmpty() ) {
            throw new StingException("Can not index new sequence: lookup table is already non-empty");
        }
        addToIndex(toKeyOffsetList(K,seq,0,seq.length));
    }

    /**
     * Updates existing index. It is assumed that the sequence that was already indexed by this KmerIndex object is
     * the exact subsequence of length <code>old_length</code> of the new sequence <code>seq</code>, starting at
     * position <code>old_start</code>. No checks are performed, so it is the responsibility of the caller to ensure
     * that this is indeed the case, otherwise the index will be inconsistent. Since the old sequence is a part
     * of the new one, this method will keep all the already computed kmers (and update their offsets as needed),
     * and compute and add kmers/offsets for all the novel bases added to the sequence <code>seq</code> compared
     * to the old, already indexed subsequence. If <code>old_length</code> is less than
     * K (i.e. old sequence could not be and was not indexed at all), the new sequence <code>seq</code> will
     * be fully indexed from start to end.
     * @param seq the new, extended sequence
     * @param old_start already indexed subsequence starts at this position in <code>seq</code>
     * @param old_length length of the already indexed subsequence
     */
    public void updateIndex(final byte[] seq, final int old_start, final int old_length) {
        if ( old_length < K ) {
            if ( ! lookup.isEmpty())
                throw new StingException("It is claimed that old indexed sequence is shorter than K (i.e. it could not be indexed), but index is non empty");
            addToIndex( toKeyOffsetList(K,seq,0,seq.length));
            return;
        }
        if ( old_start > 0 ) {
            // update positions of previously indexed k-mers:
            for ( Map.Entry<Integer,List<Integer>> e : lookup.entrySet() ) {
                List<Integer> l = e.getValue();
                for ( int i = 0 ; i < l.size(); i++ ) l.set(i,l.get(i)+old_start);
            }
            // take care of additional k-mers appearing *before* the already indexed subsequence:
            // if already indexed subsequence starts at 'start', the first k-mer from that sequence
            // ends at start+K-1 (inclusive) and it is obviously already indexed. So the last k-mer we want to index now ends at
            // start+K-2 (inclusive), the length of [0,start+K-2] interval that we need to index is
            // start+K-1.
            addToIndex( toKeyOffsetList(K,seq,0,old_start+K-1) );
        }
        // the last k-mer we already indexed ends at start+length-1 (inclusive); so it starts at start+length-1-(K-1)=start+length-K.
        // Hence, the first k-mer that is not indexed yet starts at start+length-K+1. The length of the subsequence that
        // we need to index, [start+length-K+1,seq.length) is seq.length - start - length +K - 1
        int pos = old_start+old_length-K+1;
        // If no bases were appended after the old subsequence, the remaining region holds only K-1 bases
        // and there is no new kmer to add; toKeyOffsetList() would throw on such a short region.
        if ( seq.length - pos >= K ) {
            addToIndex( toKeyOffsetList(K,seq,pos,seq.length-pos) );
        }
    }

    /** Convenience shortcut: takes the list of keys/offsets and pushes offsets into the lookup index for the keys that
     * do exist already, or first creates the new entry and then pushes the offset for keys that are novel. This method
     * is quiet: if <code>keys</code> is <code>null</code> or an empty list, it does nothing.
     * @param keys list of (kmer, offset) pairs to register
     */
    private void addToIndex(final List<PrimitivePair.Int> keys ) {
        if ( keys == null ) return;
        for ( PrimitivePair.Int key: keys ) {
            List<Integer> l = lookup.get(key.getFirst());
            if ( l == null ) {
                l = new ArrayList<Integer>();
                lookup.put(key.getFirst(),l);
            }
            l.add(key.getSecond());
        }
    }

    /**
     * Converts kmer (integer key) of length K into its sequence representation. Returns a sequence (over ACGT alphabet)
     * of length K that corresponds to the specified key.
     * @param K kmer length
     * @param kmer integer kmer id
     * @return array of K bases encoded by <code>kmer</code>
     */
    public static byte [] idToSeq(int K, int kmer) {
        byte [] seq = new byte[K];
        // the last base of the kmer lives in the two lowest bits, so fill the array back to front
        for ( int i = K-1; i >=0 ; i-- ) {
            seq[i] = BaseUtils.baseIndexToSimpleBase(kmer & 0x3);
            kmer >>= 2;
        }
        return seq;
    }

    /** Returns all offsets for the specified kmer (key) on the sequence indexed by this KmerIndex object. Returns
     * null if specified kmer is not present on the indexed sequence.
     * @param key integer kmer id
     * @return list of offsets, or null if the kmer is not in the index
     */
    public List<Integer> getOffsets(int key) { return lookup.get(key); }

//    public List<Integer> getOffsets(byte[] seq) {
//        if ( seq.length != K ) throw new StingException("Can not perform direct lookup of a sequence with length different from key size");
//
//        return getOffsets( toKey(seq) ) ;
//    }
}

View File

@ -0,0 +1,401 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.utils;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.CigarElement;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.exceptions.StingException;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Aug 6, 2010
* Time: 6:18:01 PM
* To change this template use File | Settings | File Templates.
*/
public class ReadPair {

    /** Relative placement/orientation of the two ends of a pair, as computed by #getWouldBePairType(). */
    public enum PairType {
        UNKNOWN,        // nothing is known yet (no read was added)
        BOTH_UNMAPPED,
        ONE_UNMAPPED,
        PROPER,         // ---> <--- on the same contig
        LEFT,           // <--- <--- on the same contig
        RIGHT,          // ---> ---> on the same contig
        OUTER,          // <--- ---> on the same contig
        INTER           // ends map onto different contigs
    };

    private SAMRecord end1 = null;
    private SAMRecord end2 = null;
    private PairType pType = PairType.UNKNOWN;
    private int leftStart = -1;
    private int rightStart = -1;
    private SAMRecord leftRead = null;
    private SAMRecord rightRead = null;

    /** Creates an empty read pair object */
    public ReadPair() {}

    /** Creates a read pair object initialized with the specified read */
    public ReadPair(SAMRecord read) {
        addRead(read);
    }

    /** Returns name of the paired read (it is assumed that both individual reads in the pair share same name).
     *
     * @return read name, or null if no end was recorded yet
     */
    public String getName() { return ( end1 != null ? end1.getReadName() : (end2 != null ? end2.getReadName() : null) ); }

    /** Returns true if both ends are recorded in this read pair object. Note that because SAM records carry
     * mate information, a pair can be (partially) initialized from one end. This method verifies that this is not the case
     * and both records are actually present.
     * @return true if both end records are present
     */
    public boolean hasBothEnds() { return end1 != null && end2 != null ; }

    /** Returns true if this pair object was initialized with at least one end. Since SAM records carry mate information,
     * it is sometimes sufficient to have only one read (fragment end) actually recorded in the pair object, at which
     * point some useful information can be retrieved for the pair already.
     * @return true if at least one end record is present
     */
    public boolean hasAnyData() { return end1 != null || end2 != null ; }

    /** Returns true if both ends in the pair are mapped. The pair object must be at least partially initialized (i.e.
     * it has to hold a reference to at least one end of the pair), otherwise an exception will be thrown.
     * @return true if neither end is unmapped
     */
    public boolean bothEndsMapped() {
        if ( pType == PairType.UNKNOWN ) throw new StingException("ReadPair object was not initialized yet, method can not be applied");
        if ( pType == PairType.BOTH_UNMAPPED || pType == PairType.ONE_UNMAPPED ) return false;
        return true;
    }

    /** Returns true if both ends in the pair are mapped uniquely (here: with mapping quality above zero).
     * This method requires both ends being already registered
     * in this pair object (i.e. hasBothEnds() is true), otherwise an exception will be thrown.
     * @return true if both ends are mapped with mapping quality > 0
     */
    public boolean bothEndsUniquelyMapped() {
        if ( ! hasBothEnds() ) throw new StingException("Can not determine if both ends are uniquely mapped until both ends are recorded");
        return bothEndsMapped() && end1.getMappingQuality() > 0 && end2.getMappingQuality() > 0;
    }

    /** Returns true if this pair is in proper orientation, i.e. ---> <--- on the same contig */
    public boolean isProper() { return pType == PairType.PROPER; }

    /** Returns true if this pair is in outer orientation, i.e. <--- ---> on the same chromosome */
    public boolean isOuter() { return pType == PairType.OUTER; }

    /** Returns left (coordinate-wise) read in the pair. Both ends need to be mapped, and they should map
     * onto the same contig, otherwise an exception will be thrown.
     * @return the left read
     */
    public SAMRecord getLeftRead() {
        if ( ! bothEndsMapped() || pType == PairType.INTER )
            throw new StingException("Left read can be identified only when both reads are mapped onto the same contig, and the are not for "+getName());
        if ( leftRead == null )
            throw new StingException("Left read is not recorded. Maybe we have not seen it yet? Pair: "+getName());
        return leftRead;
    }

    /** Returns right (coordinate-wise) read in the pair. Both ends need to be mapped, and they should map
     * onto the same contig, otherwise an exception will be thrown.
     * @return the right read
     */
    public SAMRecord getRightRead() {
        if ( ! bothEndsMapped() || pType == PairType.INTER )
            throw new StingException("Right read can be identified only when both reads are mapped onto the same contig, and the are not for "+getName());
        if ( rightRead == null )
            throw new StingException("Right read is not recorded. Maybe we have not seen it yet? Pair: "+getName());
        return rightRead;
    }

    public SAMRecord getEnd1() { return end1; }
    public SAMRecord getEnd2() { return end2; }
    public PairType getPairType() { return pType ; }

    /** Registers read <code>r</code> as one of the ends of this pair (which end is decided by the read's
     * first/second-of-pair flags) and updates/cross-checks the pair type and left/right positions.
     * The read is validated before the pair is modified.
     * @param r read to add; must be flagged as paired and as first or second of pair
     * @throws StingException if the read is unpaired, has neither first nor second of pair flag set,
     * the corresponding end is already recorded, or the read name does not match the end already recorded
     */
    public void addRead(SAMRecord r) {
        if ( ! r.getReadPairedFlag() ) throw new StingException("Read "+r.getReadName() +" is unpaired");
        if ( r.getFirstOfPairFlag() ) {
            if ( end1 != null ) throw new StingException("Read "+r.getReadName()+" is first of pair and the pair already has first read recorded");
            if ( end2 != null && ! r.getReadName().equals(end2.getReadName()) )
                throw new StingException("The pair already has read "+end2.getReadName() +"; the read being added does not match by name ("+r.getReadName()+")" );
            end1 = r;
        } else {
            if ( r.getSecondOfPairFlag() ) {
                if ( end2 != null ) throw new StingException("Read "+r.getReadName()+" is second of pair and the pair already has second read recorded");
                if ( end1 != null && ! end1.getReadName().equals(r.getReadName()) )
                    throw new StingException("The pair already has read "+end1.getReadName() +"; the read being added does not match by name ("+r.getReadName()+")" );
                end2 = r;
            } else {
                throw new StingException("The read "+r.getReadName()+" is marked as paired, but the first/second of pair flag is not set");
            }
        }
        setPairInfo(r);
    }

    /** If pair type has not been set yet, then sets it to <code>t</code>. Otherwise (pair type already set),
     * just checks if the pair type is <code>t</code>. If it is, the method returns quietly; if it is not (inconsistency detected),
     * throws an exception.
     *
     */
    private void setCheckPairType(PairType t) {
        if ( pType != PairType.UNKNOWN ) {
            if ( pType != t )
                throw new StingException("In pair "+getName()+" two ends provide conflicting alignment information");
        } else pType = t;
    }

    /** Sets the left start position if it is not set yet, otherwise verifies that it equals <code>pos</code>. */
    private void setCheckLeftStart(int pos) {
        if ( leftStart >= 0 ) {
            if ( leftStart != pos )
                throw new StingException("In pair "+getName()+" two ends provide conflicting alignment information");
        } else leftStart = pos;
    }

    /** Sets the right start position if it is not set yet, otherwise verifies that it equals <code>pos</code>. */
    private void setCheckRightStart(int pos) {
        if ( rightStart >= 0 ) {
            if ( rightStart != pos )
                throw new StingException("In pair "+getName()+" two ends provide conflicting alignment information");
        } else rightStart = pos;
    }

    /** Decides whether <code>read</code> should be treated as the left end of its pair. Normally this is
     * simply the read with the smaller alignment start. When both ends start at the same position the
     * decision must still be made so that the two ends come to opposite conclusions: if strands differ,
     * the forward-strand read is left (same rule as in #getWouldBePairType()); if strands are the same,
     * the first-of-pair read is left.
     */
    private static boolean isLeftOfMate(SAMRecord read) {
        final int readStart = read.getAlignmentStart();
        final int mateStart = read.getMateAlignmentStart();
        if ( readStart != mateStart ) return readStart < mateStart;
        if ( read.getReadNegativeStrandFlag() != read.getMateNegativeStrandFlag() ) return ! read.getReadNegativeStrandFlag();
        return read.getFirstOfPairFlag();
    }

    /** Updates pair type, left/right start positions and left/right read references using the
     * alignment and mate information carried by <code>read</code>.
     */
    private void setPairInfo(SAMRecord read) {
        setCheckPairType(getPairType(read));

        // there is nothing left to do unless at least one end is mapped and the ends are not on different contigs.
        // (A pair with both ends unmapped has no left/right reads; without this check the second unmapped end
        // would always be rejected as a duplicate "right" read.)
        if ( pType == PairType.INTER || pType == PairType.BOTH_UNMAPPED ) return;

        if ( pType == PairType.ONE_UNMAPPED ) {
            // set putative left or right read depending on the orientation of the only mapped mate
            if ( ! AlignmentUtils.isReadUnmapped(read ) ) {
                // we can set left/right read only if it is the current read that is mapped; if we have the
                // unmapped mate, skip and wait for the mapped read to come!
                if ( read.getReadNegativeStrandFlag() ) {
                    setCheckRightStart(read.getAlignmentStart());
                    if ( rightRead != null ) throw new StingException("Right read was already set for the pair");
                    rightRead = read;
                } else {
                    setCheckLeftStart(read.getAlignmentStart());
                    if ( leftRead != null ) throw new StingException("Left read was already set for the pair");
                    leftRead = read;
                }
            }
            return;
        }

        // we are here if both ends are mapped and they map onto the same contig
        if ( isLeftOfMate(read) ) { //left/right = read/mate
            setCheckLeftStart(read.getAlignmentStart());
            setCheckRightStart(read.getMateAlignmentStart());
            if ( leftRead != null ) throw new StingException("Left read was already set for the pair");
            leftRead = read;
        } else {
            // left/right = mate/read
            setCheckLeftStart(read.getMateAlignmentStart());
            setCheckRightStart(read.getAlignmentStart());
            if ( rightRead != null ) throw new StingException("Right read was already set for the pair");
            rightRead = read;
        }
    }

    /** Returns pair type that describes this read and its mate. The alignment information for both the read itself
     * and its mate is taken from the read's sam record passed as the argument, so the mate information is expected to be
     * correctly set!
     * @param read the read to examine
     * @return pair type implied by the read's own record
     */
    public static PairType getPairType(SAMRecord read) {
        if ( AlignmentUtils.isReadUnmapped(read) ) {
            if ( AlignmentUtils.isMateUnmapped(read) ) return PairType.BOTH_UNMAPPED;
            else return PairType.ONE_UNMAPPED;
        }
        return getWouldBePairType(read,read.getReferenceIndex(),read.getAlignmentStart(),read.getReadNegativeStrandFlag());
    }

    /** Returns pair type that would describe this read and its mate, if this read mapped onto refId:start in orientation
     * given by rc (forward is rc=false, reverse is rc=true). The read's alignment information (if any,
     * unmapped reads are allowed) present in the SAM record is completely ignored by this method,
     * only mate's information is used. If the read and its mate start at the same position, the forward-strand
     * end is considered to be the left one, so that both ends of such a pair report the same pair type.
     * @param read read whose mate information is used
     * @param refId (hypothetical) reference index the read maps to
     * @param start (hypothetical) alignment start of the read
     * @param rc (hypothetical) strand of the read; true for reverse strand
     * @return pair type the read and its mate would have
     */
    public static PairType getWouldBePairType(SAMRecord read, int refId, int start, boolean rc) {
        if ( AlignmentUtils.isMateUnmapped(read) ) return PairType.ONE_UNMAPPED ;

        // both read and mate are mapped:
        if ( refId != read.getMateReferenceIndex() ) return PairType.INTER;

        // both read and its mate map onto the same chromosome
        final int mateStart = read.getMateAlignmentStart();
        final boolean readIsLeft = start < mateStart || ( start == mateStart && ! rc );

        if ( readIsLeft ) { //left/right = read/mate
            if ( rc ) {
                if ( read.getMateNegativeStrandFlag() ) return PairType.LEFT;
                else return PairType.OUTER;
            } else {
                if ( read.getMateNegativeStrandFlag() ) return PairType.PROPER;
                else return PairType.RIGHT;
            }
        } else {
            // left/right = mate/read
            if ( rc ) {
                if ( read.getMateNegativeStrandFlag() ) return PairType.LEFT;
                else return PairType.PROPER;
            } else {
                if ( read.getMateNegativeStrandFlag() ) return PairType.OUTER;
                else return PairType.RIGHT;
            }
        }
    }

    /** Returns start position recorded for the left end, or -1 if it is not known. */
    public int getLeftStart() {
        if ( ! hasAnyData() ) throw new StingException("ReadPair object was not initialized yet, method can not be applied");
        return leftStart;
    }

    /** Returns start position recorded for the right end, or -1 if it is not known. */
    public int getRightStart() {
        if ( ! hasAnyData() ) throw new StingException("ReadPair object was not initialized yet, method can not be applied");
        return rightStart;
    }

    /** Returns fragment size computed from the two ends of this pair; both ends must be recorded and mapped,
     * and the pair must be in proper orientation, otherwise an exception is thrown.
     */
    public int getFragmentSize() {
        if ( ! hasBothEnds() ) throw new StingException("Can not determine fragment size: pair object does not have both ends yet");
        if ( ! bothEndsMapped() ) throw new StingException("Can not determine fragment size: both ends must be mapped");
        if ( pType != PairType.PROPER ) throw new StingException("The pais is not in proper orientation, can not determine fragment size");

        return getFragmentSize(leftRead,rightRead);
    }

    /** Given a read (that must belong to this pair), returns the other end in the pair if it is already
     * recorded, or null otherwise.
     * @param read one end of this pair
     * @return the other end, or null if it is not recorded (or the read has neither first nor second of pair flag)
     */
    public SAMRecord getOtherEnd(SAMRecord read) {
        if ( read.getFirstOfPairFlag() ) return end2;
        else {
            if ( read.getSecondOfPairFlag() ) return end1;
        }
        return null;
    }

    /** Computes fragment size from the left and right reads: sum of the read lengths plus the gap between
     * the reads if they do not overlap, or less the overlap if they do.
     * @param left left (coordinate-wise) read
     * @param right right (coordinate-wise) read
     * @return estimated fragment size
     * @throws StingException if either read is null or unmapped, or the reads map onto different contigs
     */
    public static int getFragmentSize(SAMRecord left, SAMRecord right) {

        if ( left == null || right == null ||
                AlignmentUtils.isReadUnmapped(left) || AlignmentUtils.isReadUnmapped(right) ) {
            throw new StingException("No read (null) or unmapped read provided: fragment size is not defined");
        }
        // compare contig indices by value: comparing the boxed Integer objects with != would compare
        // references, which is only reliable for small cached values
        final int leftContig = left.getReferenceIndex();
        final int rightContig = right.getReferenceIndex();
        if ( leftContig != rightContig ) {
            throw new StingException("Left/right reads map onto different contigs: fragment size is not defined");
        }

        int leftEnd = left.getAlignmentEnd();
        int rightStart = right.getAlignmentStart();

        if ( rightStart > leftEnd ) {
            // if reads are not overlapping, fragment length is lengths of both reads plus the distance (gap) between
            // the reads. Note that if the sequence between the reads happens to have insertions or deletions,
            // our estimation of the actual distance between the reads (on the fragment) is incorrect, but we
            // can not do better given just those reads. This estimation is, in particular, incorrect
            // for left reads ending with 'I' and/or right reads starting with 'I'
            //
            //    left               right
            //  -------->...gap...<--------     fragment = left+gap+right

            return left.getReadLength() + right.getReadLength() + (rightStart - leftEnd-1);
        }

        // if we are here, the reads do overlap; fragment length is lengths of the two reads less the overlap.
        // in this case we can compute the actual overlap between the reads (on the fragment) taking into
        // account indels, if any
        //
        //      left    ****     right
        //  ------------>                ****=overlap; fragment = left+right - overlap
        //          <--------------
        //
        // with deletion:
        //
        //      left     **   **     right
        //  -----------ddd->             ****=overlap; fragment = left+right - overlap
        //           <-ddd-------------      note that overlap != leftEnd - rightStart+1
        //                                   instead, overlap = leftEnd-rightStart+1- length(D)
        // with insertion:
        //
        //      left     *******     right       ******* = overlap; fragment = left+right - overlap
        //  -------------iii->               note that overlap != leftEnd - rightStart +1
        //           <-iii--------------     instead, overlap = leftEnd - rightStart +1 + length(I)
        //                                   (since 'i' bases are NOT on the ref)

        // NOTE(review): the walk below goes over the *left* read's cigar from its first element while the
        // reference position starts at rightStart, so indels are counted in the leading, not the
        // overlapping, part of the left read. It looks like either the right read's cigar or a start at
        // left.getAlignmentStart() was intended -- TODO confirm; the computation is left unchanged here.
        int posOnRef = rightStart;
//        int posOnRightRead = 0;

        int overlap = leftEnd - rightStart + 1 ;

        for(CigarElement ce : left.getCigar().getCigarElements() ) {
            switch(ce.getOperator()) {
                case S:
                case H:
//                    posOnRightRead+=ce.getLength();
                    break;
                case I:
                    overlap += ce.getLength();
                    break;
                case D:
                case N:
                    overlap -= ce.getLength();
                    // intentional fall-through: deleted/skipped bases advance the reference position too
                case M:
                    posOnRef += ce.getLength();
                    break;
                default:
            }
            if ( posOnRef > leftEnd ) break; // we need to examine only overlapping part of the reads
        }
        return left.getReadLength() + right.getReadLength() - overlap;
    }
}

View File

@ -0,0 +1,195 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource;
import org.broadinstitute.sting.oneoffprojects.utils.ReadPair;
import org.broadinstitute.sting.oneoffprojects.utils.AlignmentInfo;
import org.broadinstitute.sting.oneoffprojects.utils.Assembly;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import net.sf.samtools.SAMRecord;
import java.util.Map;
import java.util.HashMap;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Sep 13, 2010
* Time: 12:45:42 PM
* To change this template use File | Settings | File Templates.
*/
@WalkerName("DetectWGA")
@Requires(value={DataSource.REFERENCE_BASES})
public class DetectWGAWalker extends ReadWalker<Integer,Integer> {
private int TIP_LENGTH = 10;
private int TIP_MM_THRESHOLD = 1;
private double TIP_AV_QUAL_THRESHOLD = 15.0;
private boolean DEBUG = true;
Map<String, ReadPair> pairCache = null;
Map<String,MathUtils.RunningAverage> fragmentSizeMap = null; // by library
private ReferenceDataSource refData;
private byte[] refBases;
@Override
public void initialize() {
refData = new ReferenceDataSource(getToolkit().getArguments().referenceFile);
pairCache = new HashMap<String, ReadPair>();
fragmentSizeMap = new HashMap<String,MathUtils.RunningAverage>();
}
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
if ( ! read.getReadPairedFlag() ) return 0; // for now!!
// read is paired
cacheReadAsPair(read);
/*
// if the read is already mapped (uniquely), we check if it may have the "colored tip" artifact on either side:
if ( AlignmentUtils.isReadUniquelyMapped(read) ) {
TipInfo tips = countTipMismatches(read,TIP_LENGTH);
if ( tips.leftMM() >= TIP_MM_THRESHOLD || tips.rightMM() >= TIP_MM_THRESHOLD ) {
if ( DEBUG ) {
out.println(" Read "+read.getReadName()+ " has "+tips.leftMM()+"/"+tips.rightMM()+" mismatches in the tips");
out.println(" Pair orientation: "+pair.getPairType());
}
// try adding read to existing assemblies:
AlignmentInfo al = alignToAllAddToBest(read,Math.min(3,tips.leftMM()+tips.rightMM())-1);
if ( al == null ) {
if ( tips.leftMM() >= TIP_MM_THRESHOLD && tips.leftQ() >= TIP_AV_QUAL_THRESHOLD ||
tips.rightMM() >= TIP_MM_THRESHOLD && tips.rightQ() >= TIP_AV_QUAL_THRESHOLD ) {
if ( DEBUG ) out.println(" Initialized new assembly.") ;
Assembly a = new Assembly(read.getReadBases(),read.getReadName(),read.getAlignmentStart());
tryAndAddUnmapped(a); // see if we got unmapped reads that would align nicely
assemblies.add(a);
}
}
}
return 1;
}
*/
return null; //To change body of implemented methods use File | Settings | File Templates.
}
/**
* Provide an initial value for reduce computations.
*
* @return Initial value of reduce.
*/
public Integer reduceInit() {
return null; //To change body of implemented methods use File | Settings | File Templates.
}
/**
* Reduces a single map with the accumulator provided as the ReduceType.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return accumulator with result of the map taken into account.
*/
public Integer reduce(Integer value, Integer sum) {
return null; //To change body of implemented methods use File | Settings | File Templates.
}
/** Registers the read with its pair object and keeps the per-library fragment size statistics up to date.
 * If no pair object is cached yet under the read's name, a new one is created from the read and stored in
 * the pair cache; in either case the read is then added to the pair. Once the pair has both ends, both ends
 * are mapped and the pair is proper, its fragment size is added to the running average kept for the library
 * of the read's read group (the running average is created on first use).
 *
 * @param read the read to cache
 */
private void cacheReadAsPair(SAMRecord read) {
    final String readName = read.getReadName();

    ReadPair cachedPair = pairCache.get(readName);
    if ( cachedPair == null ) {
        // first time we see a read with this name: start a new pair and register it
        cachedPair = new ReadPair(read);
        pairCache.put(readName, cachedPair);
    }
    cachedPair.addRead(read);

    // only good pairs contribute to the fragment size statistics
    if ( ! cachedPair.hasBothEnds() || ! cachedPair.bothEndsMapped() || ! cachedPair.isProper() ) {
        return;
    }

    final String library = read.getReadGroup().getLibrary();
    MathUtils.RunningAverage sizeStats = fragmentSizeMap.get(library);
    if ( sizeStats == null ) {
        sizeStats = new MathUtils.RunningAverage();
        fragmentSizeMap.put(library, sizeStats);
    }
    sizeStats.add(cachedPair.getFragmentSize());
}
/** Counts mismatches against the reference in the first and in the last <code>tip_length</code> bases
 * of the read and wraps both counts into a TipInfo.
 *
 * @param read the read to examine
 * @param tip_length number of read bases to inspect at each end of the read
 * @return mismatch information for the left and the right tip of the read
 */
private TipInfo countTipMismatches(SAMRecord read, int tip_length) {
    final int readLength = read.getReadLength();
    final int rightTipStart = readLength - tip_length; // offset on the read where the right tip begins

    final AlignmentUtils.MismatchCount leftTip =
            AlignmentUtils.getMismatchCount(read, refBases, read.getAlignmentStart()-1, 0, tip_length);
    final AlignmentUtils.MismatchCount rightTip =
            AlignmentUtils.getMismatchCount(read, refBases, read.getAlignmentStart()-1, rightTipStart, readLength - rightTipStart);

    return new TipInfo(leftTip, rightTip);
}
/** Holds the mismatch counts for the left and the right tip of a read, together with the value of
 * <code>mismatchQualities / numMismatches</code> computed for each tip (0 when the tip has no mismatches).
 */
class TipInfo {
    AlignmentUtils.MismatchCount left_mm;
    AlignmentUtils.MismatchCount right_mm;
    double left_avQ;
    double right_avQ;

    /**
     * @param l mismatch count for the left tip
     * @param r mismatch count for the right tip
     */
    public TipInfo(AlignmentUtils.MismatchCount l,AlignmentUtils.MismatchCount r) {
        this.left_mm = l;
        this.right_mm = r;

        // guard against division by zero when a tip has no mismatches at all
        if ( l.numMismatches == 0 ) {
            this.left_avQ = 0;
        } else {
            this.left_avQ = ((double)l.mismatchQualities) / l.numMismatches;
        }

        if ( r.numMismatches == 0 ) {
            this.right_avQ = 0;
        } else {
            this.right_avQ = ((double)r.mismatchQualities) / r.numMismatches;
        }
    }

    /** @return number of mismatches in the left tip */
    public int leftMM() {
        return this.left_mm.numMismatches;
    }

    /** @return number of mismatches in the right tip */
    public int rightMM() {
        return this.right_mm.numMismatches;
    }

    /** @return mismatch qualities divided by the number of mismatches for the left tip, or 0 if there are none */
    public double leftQ() {
        return this.left_avQ;
    }

    /** @return mismatch qualities divided by the number of mismatches for the right tip, or 0 if there are none */
    public double rightQ() {
        return this.right_avQ;
    }
}
}

View File

@ -363,7 +363,13 @@ public class PairMaker extends CommandLineProgram {
}
}
if ( best.size() == 0 ) return null; // no unique alignment
if ( best.size() > 1 ) throw new RuntimeException("Multiple alignments for read "+l.get(0).getReadName()+", all with Q>="+minq);
if ( best.size() > 1 ) {
for ( SAMRecord r : best ) {
System.out.println("READ "+r.getReadName()+" mapQ="+r.getMappingQuality()+" at="+r.getReferenceName()+
":"+r.getAlignmentStart()+"("+(r.getReadNegativeStrandFlag()?"-":"+")+") cig="+r.getCigarString());
}
throw new RuntimeException("Multiple alignments for read "+l.get(0).getReadName()+", all with Q>="+minq);
}
return best.get(0);
}

View File

@ -0,0 +1,171 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.interval;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
import java.util.Iterator;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Oct 7, 2010
* Time: 2:40:02 PM
* To change this template use File | Settings | File Templates.
*/
/** This class provides an adapter to {@code Iterator<GenomeLoc>} that returns only (parts of) underlying iterator's
 * intervals overlapping with specified "master set" of bounding intervals. The underlying iterator must return
 * NON-overlapping intervals in coordinate-sorted order, otherwise the behavior is unspecified. If the master set is
 * represented by another interval iterator, it should return sorted and NON-overlapping intervals.
 *
 * The iterator always keeps the next overlap prefetched, so both underlying iterators are read ahead of
 * what has been returned to the caller so far.
 */
public class OverlappingIntervalIterator implements Iterator<GenomeLoc> {
    PushbackIterator<GenomeLoc> iter = null;
    PushbackIterator<GenomeLoc> boundBy = null;

    /** The overlap that the next call to next() will return; null when the iteration is finished. */
    GenomeLoc prefetchedOverlap = null;
    /** Bounding interval currently under consideration; null once the traversal cannot continue. */
    GenomeLoc currentBound = null;
    /** Interval currently under consideration; null once the traversal cannot continue. */
    GenomeLoc currentInterval = null;

    /** Creates new overlapping iterator that will internally traverse <code>intervals</code> and return only
     * overlaps of those with set of intervals returned by <code>boundBy</code>.
     * @param intervals sorted, non-overlapping intervals to be restricted
     * @param boundBy sorted, non-overlapping bounding intervals
     */
    public OverlappingIntervalIterator(Iterator<GenomeLoc> intervals, Iterator<GenomeLoc> boundBy) {
        this.iter = new PushbackIterator<GenomeLoc>(intervals);
        this.boundBy = new PushbackIterator<GenomeLoc>(boundBy);

        // Always go through the pushback wrappers and assign the FIELDS (not shadowing locals),
        // otherwise fetchNextOverlap() would start from null current interval/bound.
        if ( this.iter.hasNext() && this.boundBy.hasNext() ) {
            currentInterval = this.iter.next(); // load first interval
            currentBound = this.boundBy.next(); // load first bounding interval
            fetchNextOverlap();
        }
    }

    /** Traverses both iterators in sync, until the first overlap between the two is reached. If no overlap is found
     * until the end of the either of the two streams, leaves prefetchedOverlap set to null
     */
    private void fetchNextOverlap() {
        prefetchedOverlap = null;

        // a null current interval/bound means the corresponding stream ran out and no further overlap is possible
        while ( prefetchedOverlap == null && currentInterval != null && currentBound != null ) {

            if ( currentInterval.isBefore(currentBound) ) {
                // current interval ends before the bound starts: move on to the next interval, if any
                currentInterval = iter.hasNext() ? iter.next() : null;
                continue;
            }

            if ( currentInterval.isPast(currentBound) ) {
                // current interval lies past the bound: move on to the next bound, if any
                currentBound = boundBy.hasNext() ? boundBy.next() : null;
                continue;
            }

            // we are at this point only if currentInterval overlaps with currentBound
            prefetchedOverlap = currentInterval.intersect(currentBound);

            advanceAfterOverlap();
        }
    }

    /** Moves either currentInterval or currentBound (never both) to its next value after an overlap was recorded.
     *
     * After an overlap we still do not know if we are done with the current interval or with the current bound,
     * because two special situations are possible:
     *
     *  1) the next interval overlaps with the same (current) bounding interval; in this case the next interval
     *     necessarily starts before the next bound;
     *  2) the current interval also overlaps with the next bounding interval; in this case the next bound
     *     necessarily starts before the next interval.
     *
     * To cover both we advance only the stream whose next element starts first and push the other element back;
     * the rest of the traversal is performed by the next invocation of fetchNextOverlap(). When one of the
     * streams is exhausted, only the other one can be advanced; when both are exhausted the traversal is over.
     */
    private void advanceAfterOverlap() {
        final boolean moreIntervals = iter.hasNext();
        final boolean moreBounds = boundBy.hasNext();

        if ( ! moreIntervals && ! moreBounds ) {
            // nothing left anywhere: mark traversal as finished so that the last overlap is not found again
            currentInterval = null;
            currentBound = null;
            return;
        }
        if ( ! moreBounds ) {
            // remaining intervals may still overlap with the current (last) bound
            currentInterval = iter.next();
            return;
        }
        if ( ! moreIntervals ) {
            // current (last) interval may still overlap with the remaining bounds
            currentBound = boundBy.next();
            return;
        }

        GenomeLoc nextInterval = iter.next();
        GenomeLoc nextBound = boundBy.next();

        if ( nextInterval.startsBefore(nextBound) ) {
            currentInterval = nextInterval;
            boundBy.pushback(nextBound); // in case next interval overlaps with the current bound
        } else {
            currentBound = nextBound;
            iter.pushback(nextInterval); // in case current interval also overlaps with the next bound
        }
    }

    /**
     * Returns <tt>true</tt> if the iteration has more elements. (In other
     * words, returns <tt>true</tt> if <tt>next</tt> would return an element
     * rather than throwing an exception.)
     *
     * @return <tt>true</tt> if the iterator has more elements.
     */
    public boolean hasNext() {
        return prefetchedOverlap != null;
    }

    /**
     * Returns the next element in the iteration, i.e. the next overlap between the underlying
     * intervals and the bounding intervals, and prefetches the overlap that follows it.
     *
     * @return the next element in the iteration.
     * @throws java.util.NoSuchElementException
     *          iteration has no more elements.
     */
    public GenomeLoc next() {
        if ( prefetchedOverlap == null ) {
            throw new java.util.NoSuchElementException("Illegal call to next(): no more overlapping intervals are available");
        }
        GenomeLoc result = prefetchedOverlap;
        fetchNextOverlap();
        return result;
    }

    /**
     * Not supported: this iterator is a read-only view and always throws.
     *
     * @throws UnsupportedOperationException always, since the <tt>remove</tt>
     *          operation is not supported by this Iterator.
     */
    public void remove() {
        throw new UnsupportedOperationException("remove() method is not supported by OverlappingIntervalIterator");
    }
}

View File

@ -41,7 +41,7 @@ import java.util.Arrays;
public class AlignmentUtils {
private static class MismatchCount {
public static class MismatchCount {
public int numMismatches = 0;
public long mismatchQualities = 0;
}
@ -98,13 +98,13 @@ public class AlignmentUtils {
return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
}
private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) {
public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) {
return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength());
}
// todo -- this code and mismatchesInRefWindow should be combined and optimized into a single
// todo -- high performance implementation. We can do a lot better than this right now
private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) {
public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) {
MismatchCount mc = new MismatchCount();
int readIdx = 0;
@ -387,6 +387,15 @@ public class AlignmentUtils {
return true;
}
/** Tells whether the read is mapped and mapped uniquely, i.e. it is not unmapped according to
 * {@link AlignmentUtils#isReadUnmapped} and its mapping quality is greater than 0.
 *
 * @param read the read to examine
 * @return true if the read is mapped with mapping quality above 0, false otherwise
 */
public static boolean isReadUniquelyMapped(SAMRecord read) {
    if ( AlignmentUtils.isReadUnmapped(read) ) {
        return false;
    }
    return read.getMappingQuality() > 0;
}
/** Returns the array of base qualities in the order the bases were read on the machine (i.e. always starting from
* cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base
* qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array