Initial graph-based reference implementation and alignment assessor. Not suitable for public use

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1921 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-10-27 21:54:47 +00:00
parent 31d143a841
commit 68fa6da788
3 changed files with 802 additions and 0 deletions

View File

@ -0,0 +1,252 @@
package org.broadinstitute.sting.playground.gatk.walkers.graphalign;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.fasta.FastaReferenceWalker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultEdge;
import org.jgrapht.graph.SimpleGraph;
import org.jgrapht.graph.SimpleDirectedGraph;
import org.apache.log4j.Logger;
import java.util.*;
import java.io.*;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.samtools.util.StringUtil;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
public class GraphReferenceAssessor extends ReadWalker<Integer, Integer> {
@Argument(fullName="graphFile", shortName="GF", doc="", required=true)
String graphFile = null;
ObjectInputStream graphSerialStream = null;
@Argument(fullName="MAX", shortName="MAX", doc="", required=false)
int MAXREADS = -1;
@Argument(fullName="ignore0MM", shortName="I0", doc="", required=false)
boolean IGNORE_0_MM = false;
@Argument(fullName="DEBUG", shortName="DB", doc="", required=false)
int DEBUG_LEVEL = 0;
static boolean DEBUG = false;
static boolean DEBUG2 = false; // crazy level
@Argument(fullName="read", doc="", required=false)
String onlyDoRead = null;
ReferenceGraph graphRef = null;
public void initialize() {
super.initialize();
DEBUG = DEBUG_LEVEL > 0;
DEBUG2 = DEBUG_LEVEL > 1; // crazy level
try {
logger.info("Reading graph reference " + graphFile );
graphSerialStream = new ObjectInputStream( new FileInputStream( graphFile ) );
graphRef = (ReferenceGraph)graphSerialStream.readObject();
graphRef.setDebugPrinting(DEBUG);
graphRef.validateGraph();
logger.info(graphRef.toBriefString());
} catch ( FileNotFoundException e ) {
throw new StingException("Couldn't open file " + graphFile, e);
} catch ( IOException e ) {
throw new StingException("Couldn't write to file " + graphFile, e);
} catch ( ClassNotFoundException e ) {
throw new StingException("Couldn't read ReferenceGraph from file " + graphFile, e);
}
}
private static MismatchCounter countMismatches(byte[] ref, int refOffset, byte[] bases, byte[] quals, int basesOffset, int length) {
MismatchCounter mm = new MismatchCounter();
for ( int i = 0; i < length; i++ ) {
byte rawRefBase = ref[i + refOffset];
byte rawReadBase = bases[i + basesOffset];
int fragBase = BaseUtils.simpleBaseToBaseIndex((char)rawRefBase);
int readBase = BaseUtils.simpleBaseToBaseIndex((char)rawReadBase);
boolean mmP = fragBase != -1 && readBase != -1 && fragBase != readBase;
if ( mmP ) {
mm.nMM++;
mm.qSum += quals != null ? quals[i + basesOffset] : 0;
}
if ( GraphReferenceAssessor.DEBUG2 )
System.out.printf("%s%d %c %c %s %b%n", Utils.dupString(' ', basesOffset + 2), basesOffset, (char)rawRefBase, (char)rawReadBase, mm, mmP);
}
return mm;
}
private static MismatchCounter countMismatches(byte[] ref, byte[] bases, byte[] quals) {
return countMismatches(ref, 0, bases, quals, 0, bases.length);
}
private static MismatchCounter countMismatchesOnGraph( ReferenceGraph graph, Collection<Fragment> frags, int fragOffset, byte[] bases, byte[] quals, int readOffset ) {
if ( frags.size() == 0 )
throw new RuntimeException("Fragment list is empty!");
MismatchCounter minNMM = MismatchCounter.MAX_VALUE;
for ( Fragment next : frags ) {
MismatchCounter recNMM = countMismatchesOnGraph( graph, next, 0, bases, quals, readOffset );
minNMM = minNMM.min( recNMM );
}
return minNMM;
}
private static MismatchCounter countMismatchesOnGraph( ReferenceGraph graph, Fragment frag, int fragOffset, byte[] bases, byte[] quals, int readOffset ) {
if ( GraphReferenceAssessor.DEBUG )System.out.printf("%sfrag %s -> %d%n", Utils.dupString(' ', readOffset + 2), frag, readOffset);
MismatchCounter mm = new MismatchCounter();
if ( readOffset < bases.length ) {
int nRemainingBases = bases.length - readOffset;
int cmpLength = frag.getBaseLengthFrom(fragOffset, nRemainingBases); // how many bases over in the fragment are we from the offset
MismatchCounter fragMM = countMismatches(frag.getUnderlyingBases(), frag.getUnderlyingOffset() + fragOffset, bases, quals, readOffset, cmpLength);
mm.add(fragMM);
// // still have some counting to do
// for ( int i = 0; i < baseLength; i++ ) {
// int fragBaseOffset = fragOffset + i;
// int readBaseOffset = readOffset + i;
//
// byte rawFragBase = frag.getBase(fragBaseOffset);
// byte rawReadBase = bases[readBaseOffset];
// int fragBase = BaseUtils.simpleBaseToBaseIndex((char)rawFragBase);
// int readBase = BaseUtils.simpleBaseToBaseIndex((char)rawReadBase);
//
// boolean mmP = fragBase != -1 && readBase != -1 && fragBase != readBase;
// if ( mmP ) nMM++;
if ( nRemainingBases > cmpLength ) {
MismatchCounter recMM = countMismatchesOnGraph( graph, graph.outgoingFragments(frag), 0, bases, quals, readOffset + cmpLength );
mm.add(recMM);
}
}
if ( GraphReferenceAssessor.DEBUG ) System.out.printf("%s=> %s%n", Utils.dupString(' ', readOffset + 2), mm);
return mm;
}
private static MismatchCounter countMismatchesOnGraph(ReferenceGraph graph, SAMRecord read) {
if ( GraphReferenceAssessor.DEBUG ) System.out.printf("countMismatchesOnGraph( read=%s%n", read.getReadName());
GenomeLoc loc = GenomeLocParser.createGenomeLoc(read);
MismatchCounter minNMM = MismatchCounter.MAX_VALUE;
for ( Fragment frag : graph.getStartingFragment(loc) ) {
int fragOffset = frag.getFragOffsetFrom(loc); // how many bases over in the fragment are we from the offset
if ( GraphReferenceAssessor.DEBUG )
System.out.printf(" countMismatchesOnGraph frag=%s loc=%s bases=%s offset=%d%n", frag, loc, read.getReadString(), fragOffset);
MismatchCounter recNMM = countMismatchesOnGraph(graph, frag, fragOffset, read.getReadBases(), read.getBaseQualities(), 0);
minNMM = minNMM.min( recNMM );
}
return minNMM;
}
public Integer map(char[] refArg, SAMRecord read) {
if ( MAXREADS-- == 0 ) {
System.exit(0);
} else if ( onlyDoRead != null && ! read.getReadName().equals(onlyDoRead) ) {
;
} else if ( ! read.getReadUnmappedFlag() && read.getCigar().numCigarElements() == 1 ) {
try {
byte[] ref = BaseUtils.charSeq2byteSeq(refArg);
// we're all XM
int nMMFromRead = (Short)read.getAttribute("NM");
MismatchCounter nAlignedMM = countMismatches(ref, read.getReadBases(), read.getBaseQualities());
if ( ! IGNORE_0_MM || nAlignedMM.nMM > 0 ) {
MismatchCounter nGraphMM = countMismatchesOnGraph(graphRef, read);
MismatchCounter deltaMM = nAlignedMM.minus(nGraphMM);
out.printf("%50s with %5s at %10s: mismatches: %3d (delta %3d) -- %3d %3d -- %3d %3d -- delta %3d %3d%n",
read.getReadName(), read.getCigarString(), GenomeLocParser.createGenomeLoc(read),
nMMFromRead, nMMFromRead - nAlignedMM.nMM,
nAlignedMM.nMM, nAlignedMM.qSum,
nGraphMM.nMM, nGraphMM.qSum,
deltaMM.nMM, deltaMM.qSum);
if ( deltaMM.nMM < 0 || deltaMM.qSum < 0 )
throw new StingException(read.getReadName() + " is miscalculated");
}
} catch ( Exception e ) {
System.out.printf("Exception at %s at %s%n", read.getReadName(), GenomeLocParser.createGenomeLoc(read));
throw new RuntimeException(e);
}
} else {
; // don't do anything
}
return 0;
}
/**
* reduceInit is called once before any calls to the map function. We use it here to setup the output
* bam file, if it was specified on the command line
*
* @return
*/
public Integer reduceInit() {
return 0;
}
public Integer reduce(Integer readScore, Integer data) {
return data + readScore;
}
public void onTraversalDone(Integer data) {
out.printf(data.toString());
}
}
class MismatchCounter {
int nMM = 0;
int qSum = 0;
public static MismatchCounter MAX_VALUE = new MismatchCounter(Integer.MAX_VALUE, Integer.MAX_VALUE );
public MismatchCounter() {}
public MismatchCounter(int nMM, int qSum) {
this.nMM = nMM;
this.qSum = qSum;
}
public void add(MismatchCounter that) {
this.nMM += that.nMM;
this.qSum += that.qSum;
}
public MismatchCounter min(MismatchCounter that) {
int cmpQSum = Integer.valueOf(this.qSum).compareTo(that.qSum);
if ( cmpQSum < 0 ) { return this; }
else if ( cmpQSum > 0 ) { return that; }
else if ( this.nMM < that.nMM ) { return this; }
else if ( this.nMM > that.nMM ) { return that; }
else { return this; }
}
public MismatchCounter minus(MismatchCounter that) {
return new MismatchCounter(this.nMM - that.nMM, this.qSum - that.qSum);
}
public String toString() { return String.format("[MM %d %d]", nMM, qSum); }
}

View File

@ -0,0 +1,130 @@
package org.broadinstitute.sting.playground.gatk.walkers.graphalign;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.fasta.FastaReferenceWalker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultEdge;
import org.jgrapht.graph.SimpleGraph;
import org.jgrapht.graph.SimpleDirectedGraph;
import org.apache.log4j.Logger;
import java.util.*;
import java.io.ObjectOutputStream;
import java.io.FileOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.samtools.util.StringUtil;
@WalkerName("GraphReferenceBuilder")
@Requires(value={DataSource.REFERENCE})
public class GraphReferenceBuilder extends RefWalker<Integer, Integer> {
@Argument(fullName="graphFile", shortName="GF", doc="", required=true)
String graphFile = null;
@Argument(fullName="DEBUG", shortName="DB", doc="", required=false)
boolean DEBUG = false;
@Argument(fullName="VALIDATE", shortName="VD", doc="", required=false)
boolean VALIDATE_GRAPH = false;
@Argument(fullName="printFrequency", shortName="F", doc="", required=false)
int printFrequency = 10000;
ObjectOutputStream graphSerialStream = null;
ReferenceGraph graphRef = null;
ReferenceSequenceFile flatReferenceFile = null;
public void initialize() {
super.initialize();
graphRef = new ReferenceGraph(DEBUG);
try {
graphSerialStream = new ObjectOutputStream( new FileOutputStream( graphFile ) );
} catch ( FileNotFoundException e ) {
throw new StingException("Couldn't open file " + graphFile, e);
} catch ( IOException e ) {
throw new StingException("Couldn't write to file " + graphFile, e);
}
flatReferenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.getToolkit().getArguments().referenceFile);
ReferenceSequence refSeq = flatReferenceFile.nextSequence();
do {
//logger.info("Read " + refSeq);
graphRef.bindRefenceSequence(refSeq);
logger.info(String.format("contig %s has length %d", refSeq.getName(), refSeq.length()));
refSeq = flatReferenceFile.nextSequence();
} while ( refSeq != null );
System.out.println(graphRef.toBriefString());
}
int counter = printFrequency;
public Integer map(RefMetaDataTracker rodData, ReferenceContext ref, AlignmentContext context) {
// if ( context.getLocation().getStart() == 59384 ) {
// try {
// Thread.currentThread().sleep(5000);
// } catch (InterruptedException e) {
// ;
// }
// }
boolean alreadyAddedAtThisLoc = false;
for ( ReferenceOrderedDatum rod : rodData.getAllRods() ) {
if ( rod instanceof Variation && ! alreadyAddedAtThisLoc ) {
// if we have multiple variants at a locus, just take the first damn one we see for now
Variation variant = (Variation) rod;
// todo -- getAlternativeBases should be getAlleles()
GenomeLoc loc = variant.getLocation();
String[] allAllelesList = null; // variant.getAlternateBases().split(""); // todo fixme
if ( allAllelesList.length >= 3 ) { // bad dbSNP format :-(
List<String> alleles = Arrays.asList(allAllelesList).subList(1,3);
//logger.info(String.format("Adding %s %s", loc, alleles));
graphRef.addVariation(variant, loc, alleles);
//logger.info(String.format(" Added %s %s", loc, alleles));
alreadyAddedAtThisLoc = true;
if ( counter-- == 0 ) {
logger.info(String.format("Added %s %s %s", loc, alleles, graphRef.toBriefString()));
counter = printFrequency;
if ( VALIDATE_GRAPH )
graphRef.validateGraph();
}
}
}
}
return null;
}
// todo -- graph should be the reduce result
public Integer reduceInit() {
return null;
}
public Integer reduce(Integer value, Integer sum) {
return sum;
}
public void onTraversalDone(Integer sum) {
super.onTraversalDone(sum);
try {
graphSerialStream.writeObject(graphRef);
graphSerialStream.close();
} catch ( IOException e ) {
throw new StingException("Couldn't write to file " + graphFile, e);
}
}
}

View File

@ -0,0 +1,420 @@
package org.broadinstitute.sting.playground.gatk.walkers.graphalign;
import org.jgrapht.graph.DefaultEdge;
import org.jgrapht.graph.SimpleDirectedGraph;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.Variation;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.util.StringUtil;
import java.util.*;
import java.io.Serializable;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.ObjectInputStream;
import edu.mit.broad.picard.util.IntervalTree;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Oct 22, 2009
* Time: 2:15:54 PM
* To change this template use File | Settings | File Templates.
*/
class ReferenceGraph extends SimpleDirectedGraph<Fragment, DefaultEdge> implements Serializable {
final private static boolean USE_IT = true;
final private static boolean THROW_ERRORS_ON_BAD_INPUTS = false;
private boolean DEBUG = false;
int nSkippedIndels = 0;
int nSkippedBecauseOfContinguousVariation = 0;
int nBadPolymorphisms = 0;
int nMultiStateAlleles = 0;
GenomeLoc initialLoc = null;
private transient IntervalTree<Fragment> loc2Fragment = new IntervalTree<Fragment>();
public ReferenceGraph(boolean printDebuggingInfo) {
super(DefaultEdge.class);
DEBUG = printDebuggingInfo;
}
public ReferenceGraph() {
this(false);
}
public void setDebugPrinting(boolean enable) {
this.DEBUG = enable;
}
public void bindRefenceSequence(ReferenceSequence seq) {
GenomeLoc refSeqLoc = GenomeLocParser.createGenomeLoc(seq.getContigIndex(), 1, seq.length());
String refString = StringUtil.bytesToString(seq.getBases()).toUpperCase();
Fragment frag = new Fragment(refSeqLoc, 1, StringUtil.stringToBytes(refString));
addFragment(frag);
initialLoc = refSeqLoc;
}
private void addFragmentToIntervalTree(Fragment frag) {
loc2Fragment.put((int)frag.getLocation().getStart(), (int)frag.getLocation().getStop(), frag);
}
private void addFragment(Fragment frag) {
addFragmentToIntervalTree(frag);
addVertex(frag);
}
private void removeFragment(Fragment frag) {
loc2Fragment.remove((int)frag.getLocation().getStart(), (int)frag.getLocation().getStop());
removeVertex(frag);
}
public void validateGraph() {
for ( Fragment v : this.vertexSet() ) {
if ( this.inDegreeOf(v) == 0 && v.getLocation().getStart() != initialLoc.getStart() ) {
throw new StingException(String.format("Fragment %s has no incoming edges but isn't at the start of the contig %s", v, initialLoc));
}
if ( this.outDegreeOf(v) == 0 && v.getLocation().getStop() != initialLoc.getStop() ) {
throw new StingException(String.format("Fragment %s has no outgoing edges but isn't at the end of the contig %s", v, initialLoc));
}
}
//System.out.printf("Passed validation: %s%n", this.toBriefString());
}
private void rebuildIntervalTree() {
if ( DEBUG ) System.out.printf("rebuilding IntervalTree()%n");
for ( Fragment v : this.vertexSet() ) {
if ( DEBUG ) System.out.printf(" adding interval tree: %s%n", v);
addFragmentToIntervalTree(v);
}
}
private boolean allelesAreInExcisedFragment(Fragment cut, List<String> alleles) {
boolean foundRef = false;
for ( String allele : alleles ) {
if ( allele.equals(cut.getBases()) )
foundRef = true;
}
if ( ! foundRef && THROW_ERRORS_ON_BAD_INPUTS )
throw new StingException(String.format("Polymorphic alleles %s do not contain the reference sequence %s", alleles, cut.getBases()));
return foundRef;
}
public void addVariation(Variation variant, GenomeLoc loc, List<String> alleles) {
if ( DEBUG ) System.out.printf("addVariation(%s, %s)%n", loc, alleles);
//validateGraph();
if ( variant.isSNP() ) {
Fragment frag = getContainingFragment(loc);
if ( frag == null ) {
nMultiStateAlleles++;
return;
}
if ( ! allelesAreInExcisedFragment(subFragment(frag, loc, 1), alleles)) {
nBadPolymorphisms++;
return;
}
List<Fragment> split = exciseFragment(frag, loc);
if ( split != null ) {
Fragment left = split.get(0);
Fragment cut = split.get(1);
Fragment right = split.get(2);
if ( DEBUG ) System.out.printf(" cutFrag(%s, %s)%n", loc, cut);
for ( String allele : alleles ) {
byte[] bases = StringUtil.stringToBytes(allele);
double freq = 1.0 / alleles.size();
Fragment alleleFrag = new Fragment(loc, freq, 0, bases.length, bases);
if ( DEBUG ) System.out.printf(" Creating allele fragment %s%n", alleleFrag);
addFragment(alleleFrag);
if ( left != null ) addEdge(left, alleleFrag);
if ( right != null ) addEdge(alleleFrag, right);
}
} else {
nSkippedBecauseOfContinguousVariation++;
}
} else {
nSkippedIndels++;
}
}
private Fragment subFragment(Fragment frag, GenomeLoc loc, double freq ) {
return new Fragment(loc, 1, frag.getUnderlyingBases());
}
public List<Fragment> exciseFragment(Fragment frag, GenomeLoc loc) {
if ( DEBUG ) System.out.printf(" exciseFragment(%s, %s)%n", frag, loc);
GenomeLoc fragLoc = frag.getLocation();
Fragment cut = subFragment(frag, loc, 1);
Set<DefaultEdge> inToFrag = incomingEdgesOf(frag);
Set<DefaultEdge> outOfFrag = outgoingEdgesOf(frag);
Fragment left = null;
if ( fragLoc.getStart() == loc.getStart() ) {
if ( ! inToFrag.isEmpty() ) {
if ( THROW_ERRORS_ON_BAD_INPUTS )
throw new StingException(String.format("Attempting to create a variation at the start of a fragment %s %s", frag, loc));
return null;
}
} else {
GenomeLoc leftLoc = GenomeLocParser.createGenomeLoc(fragLoc.getContigIndex(), fragLoc.getStart(), loc.getStart()-1);
left = new Fragment(leftLoc, 1, frag.getUnderlyingBases());
addFragment(left);
for ( DefaultEdge e : inToFrag ) {
addEdge(getEdgeSource(e), left);
}
removeAllEdges(inToFrag);
}
Fragment right = null;
if ( fragLoc.getStop() == loc.getStop() ) {
if ( ! outOfFrag.isEmpty() ) {
throw new StingException(String.format("Attempting to create a variation at the end of a fragment %s %s", frag, loc));
}
} else {
GenomeLoc rightLoc = GenomeLocParser.createGenomeLoc(fragLoc.getContigIndex(), loc.getStop()+1, fragLoc.getStop());
right = new Fragment(rightLoc, 1, frag.getUnderlyingBases());
addFragment(right);
for ( DefaultEdge e : outOfFrag ) {
addEdge(right, getEdgeTarget(e));
}
removeAllEdges(outOfFrag);
}
if ( DEBUG ) System.out.printf(" removing %s%n", frag);
removeFragment(frag);
if ( DEBUG ) System.out.printf(" returning left=%s right=%s%n", left, right);
return Arrays.asList(left, cut, right);
}
public Fragment getContainingFragment(GenomeLoc loc) {
Fragment frag = USE_IT ? getContainingFragmentIT(loc) : getContainingFragmentG(loc);
if ( frag == null ) {
if ( THROW_ERRORS_ON_BAD_INPUTS )
throw new StingException("No spanning fragment was found for " + loc);
else
return null;
}
else if ( frag.getLocation().getStart() > loc.getStart() || frag.getLocation().getStop() < loc.getStop() )
throw new StingException("BUG: bad spanning fragment found for " + loc + " was " + frag.getLocation() );
else
return frag;
}
public Fragment getContainingFragmentG(GenomeLoc loc) {
for ( Fragment v : this.vertexSet() ) {
if ( v.getLocation().containsP(loc) ) {
return v;
}
}
return null;
}
public Fragment getContainingFragmentIT(GenomeLoc loc) {
IntervalTree.Node<Fragment> node = loc2Fragment.minOverlapper((int)loc.getStart(), (int)loc.getStop());
if ( node == null )
return null;
else
return node.getValue();
}
public Collection<Fragment> getStartingFragment(GenomeLoc loc) {
Collection<Fragment> frags = USE_IT ? getStartingFragmentIT(loc) : getStartingFragmentG(loc);
//Collection<Fragment> frags = getStartingFragmentTest(loc);
if ( frags == null || frags.size() == 0 )
throw new StingException("No fragment contains location start of " + loc);
if ( frags.size() == 1 && MathUtils.compareDoubles(frags.iterator().next().getFrequency(), 1.0) != 0 ) {
Fragment bad = frags.iterator().next();
throw new StingException(String.format("Only one fragment was found but it's frequency < 1 %s with %e", bad, bad.getFrequency()));
}
else
return frags;
}
public Collection<Fragment> getStartingFragmentTest(GenomeLoc loc) {
Collection<Fragment> fragsFromIT = getStartingFragmentIT(loc);
Collection<Fragment> fragsFromG = getStartingFragmentG(loc);
if ( fragsFromIT.size() != fragsFromG.size() ) {
throw new StingException(String.format("Fragment sizes differ %d from IntervalTree, %d from graph", fragsFromIT.size(), fragsFromG.size()));
}
return USE_IT && false ? fragsFromIT : fragsFromG;
}
public Collection<Fragment> getStartingFragmentIT(GenomeLoc loc) {
Collection<Fragment> frags = new HashSet<Fragment>();
Iterator<IntervalTree.Node<Fragment>> it = loc2Fragment.overlappers((int)loc.getStart(), (int)loc.getStart());
IntervalTree.Node<Fragment> node = null;
while ( it.hasNext() ) {
node = it.next();
frags.add(node.getValue());
}
// todo -- painful bug work around -- should be removed
if ( frags.size() == 1 && MathUtils.compareDoubles(node.getValue().getFrequency(), 1.0) != 0 ) {
System.out.printf(">>> Using IT workaround at %s <<<%n", loc);
return getStartingFragmentG(loc);
}
return frags;
// IntervalTree.Node<Fragment> node = loc2Fragment.minOverlapper((int)loc.getStart(), (int)loc.getStart());
// if ( node == null )
// return null;
// else
// return node.getValue();
}
public Collection<Fragment> getStartingFragmentG(GenomeLoc loc) {
Collection<Fragment> frags = new HashSet<Fragment>();
for ( Fragment v : this.vertexSet() ) {
//if ( v.getLocation().getStart() < loc.getStop() )
// System.out.printf("Checking %s vs. %s%n", loc, v.getLocation());
if ( v.getLocation().containsStartPosition(loc.getStart()) ) {
// System.out.printf(" Adding %s%n", v.getLocation());
frags.add(v);
}
}
return frags;
}
public Set<Fragment> outgoingFragments( Fragment frag ) {
Set<Fragment> outgoingFrags = new HashSet<Fragment>();
for ( DefaultEdge e : outgoingEdgesOf(frag) ) {
outgoingFrags.add(getEdgeTarget(e));
}
if ( outgoingFrags.size() == 0 && frag.getLocation().getStop() != initialLoc.getStop() ) {
}
return outgoingFrags;
}
public String toString() {
StringBuilder s = new StringBuilder();
for ( Fragment v : this.vertexSet() ) {
s.append(String.format("Fragment: %s%n", v.toString()));
for ( DefaultEdge e : this.incomingEdgesOf(v) ) {
s.append(String.format(" [IN FROM] %s%n", this.getEdgeSource(e)));
}
for ( DefaultEdge e : this.outgoingEdgesOf(v) ) {
s.append(String.format(" [OUT TO ] %s%n", this.getEdgeTarget(e)));
}
}
return s.toString();
}
public String toBriefString() {
return String.format("GraphRef: %d fragments, %d edges, skipped %d contingous variants, %d indels, %d polymorphisms w/o ref allele, %d multi-state",
this.vertexSet().size(), this.edgeSet().size(), nSkippedBecauseOfContinguousVariation, nSkippedIndels, nBadPolymorphisms, nMultiStateAlleles);
}
// --------------------------------------------------------------------------------------------------------------
//
// serialization
//
// --------------------------------------------------------------------------------------------------------------
private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException {
//always perform the default de-serialization first
stream.defaultReadObject();
loc2Fragment = new IntervalTree<Fragment>();
rebuildIntervalTree();
}
}
class Fragment implements Serializable {
GenomeLoc loc = null; // Index position of this fragment into the reference
int offset, stop;
double freq = -1;
byte[] bases = null;
public Fragment( GenomeLoc loc, double freq, int offset, int stop, byte[] bases ) {
this.loc = loc;
this.freq = freq;
this.bases = bases;
this.offset = offset;
this.stop = stop;
}
public Fragment( GenomeLoc loc, double freq, byte[] bases ) {
this(loc, freq, (int)loc.getStart()-1, (int)loc.getStop(), bases);
}
public String toString() {
//return String.format("%s:%.2f:%s", loc.toString(), getFrequency(), getBases());
return String.format("%s:%.2f", loc.toString(), getFrequency());
}
public GenomeLoc getLocation() {
return loc;
}
public double getFrequency() {
return freq;
}
public int getUnderlyingOffset() { return offset; }
public int getStop() { return stop; }
public int getLength() { return getStop() - getUnderlyingOffset(); }
public byte[] getUnderlyingBases() {
return bases;
}
/**
* how many bases over in the fragment are we over in this fragment?
*
* @param loc
* @return
*/
public int getFragOffsetFrom(GenomeLoc loc) {
// todo -- ignores contigs -- can we fix this?
if ( getLocation().getStart() > loc.getStart() )
throw new StingException("BUG: Request for offset from " + loc + " in frag at " + getLocation() + " but this is beyond the location of the fragment");
return (int)(loc.getStart() - getLocation().getStart());
}
public int getBaseLengthFrom( int fragOffset, int maxLength ) {
int fragRemaining = getLength() - fragOffset;
if ( fragRemaining < 0 )
throw new StingException("BUG: Request for length from offset " + fragOffset + " but this is longer than the fragment itself");
return Math.min(fragRemaining, maxLength);
}
public byte getBase(int fragOffset) {
return bases[getUnderlyingOffset() + fragOffset];
}
public String getBases() {
return StringUtil.bytesToString(getUnderlyingBases(), getUnderlyingOffset(), getLength());
}
}