Improved scatter contigs algorithm to be fairer when splitting a large number of contigs into a small number of parts.
See also: http://gatkforums.broadinstitute.org/discussion/comment/16010 Signed-off-by: Khalid Shakir <kshakir@broadinstitute.org>
This commit is contained in:
parent
edaa78dea1
commit
a62dc84795
|
|
@ -362,29 +362,95 @@ public class IntervalUtils {
|
||||||
* @param scatterParts The output interval lists to write to.
|
* @param scatterParts The output interval lists to write to.
|
||||||
*/
|
*/
|
||||||
public static void scatterContigIntervals(SAMFileHeader fileHeader, List<GenomeLoc> locs, List<File> scatterParts) {
|
public static void scatterContigIntervals(SAMFileHeader fileHeader, List<GenomeLoc> locs, List<File> scatterParts) {
|
||||||
IntervalList intervalList = null;
|
|
||||||
int fileIndex = -1;
|
// Contract: must divide locs up so that each of scatterParts gets a sublist such that:
|
||||||
int locIndex = 0;
|
// (a) all locs concerning a particular contig go to the same part
|
||||||
String contig = null;
|
// (b) locs are not split or combined, and remain in the same order (so scatterParts[0] + ... + scatterParts[n] == locs)
|
||||||
for (GenomeLoc loc: locs) {
|
|
||||||
// If there are still more files to write and the contig doesn't match...
|
// Locs are already sorted.
|
||||||
if ((fileIndex+1 < scatterParts.size()) && (contig == null || !contig.equals(loc.getContig()))) {
|
|
||||||
// Then close the current file and start a new one.
|
long totalBases = 0;
|
||||||
if (intervalList != null) {
|
for(GenomeLoc loc : locs)
|
||||||
intervalList.write(scatterParts.get(fileIndex));
|
totalBases += loc.getStop() - loc.getStart();
|
||||||
intervalList = null;
|
|
||||||
}
|
long idealBasesPerPart = totalBases / scatterParts.size();
|
||||||
fileIndex++;
|
if(idealBasesPerPart == 0)
|
||||||
contig = loc.getContig();
|
throw new UserException.BadInput(String.format("Genome region is too short (%d bases) to split into %d parts", totalBases, scatterParts.size()));
|
||||||
}
|
|
||||||
if (intervalList == null)
|
// Find the indices in locs where we switch from one contig to the next.
|
||||||
intervalList = new IntervalList(fileHeader);
|
ArrayList<Integer> contigStartLocs = new ArrayList<Integer>();
|
||||||
intervalList.add(toInterval(loc, ++locIndex));
|
String prevContig = null;
|
||||||
}
|
|
||||||
if (intervalList != null)
|
for(int i = 0; i < locs.size(); ++i) {
|
||||||
intervalList.write(scatterParts.get(fileIndex));
|
|
||||||
if ((fileIndex + 1) != scatterParts.size())
|
GenomeLoc loc = locs.get(i);
|
||||||
throw new UserException.BadArgumentValue("scatterParts", String.format("Only able to write contigs into %d of %d files.", fileIndex + 1, scatterParts.size()));
|
if(prevContig == null || !loc.getContig().equals(prevContig))
|
||||||
|
contigStartLocs.add(i);
|
||||||
|
prevContig = loc.getContig();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if(contigStartLocs.size() < scatterParts.size())
|
||||||
|
throw new UserException.BadInput(String.format("Input genome region has too few contigs (%d) to split into %d parts", contigStartLocs.size(), scatterParts.size()));
|
||||||
|
|
||||||
|
long thisPartBases = 0;
|
||||||
|
int partIdx = 0;
|
||||||
|
IntervalList outList = new IntervalList(fileHeader);
|
||||||
|
|
||||||
|
for(int i = 0; i < locs.size(); ++i) {
|
||||||
|
|
||||||
|
GenomeLoc loc = locs.get(i);
|
||||||
|
thisPartBases += loc.getStop() - loc.getStart();
|
||||||
|
|
||||||
|
outList.add(toInterval(loc, i));
|
||||||
|
|
||||||
|
boolean partMustStop = false;
|
||||||
|
|
||||||
|
if(partIdx < (scatterParts.size() - 1)) {
|
||||||
|
|
||||||
|
// If there are n contigs and n parts remaining then we must split here,
|
||||||
|
// otherwise we will run out of contigs.
|
||||||
|
|
||||||
|
int nextPart = partIdx + 1;
|
||||||
|
int nextPartMustStartBy = contigStartLocs.get(nextPart + (contigStartLocs.size() - scatterParts.size()));
|
||||||
|
if(i + 1 == nextPartMustStartBy)
|
||||||
|
partMustStop = true;
|
||||||
|
|
||||||
|
}
|
||||||
|
else if(i == locs.size() - 1) {
|
||||||
|
|
||||||
|
// We're done! Write the last scatter file.
|
||||||
|
partMustStop = true;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if(partMustStop || thisPartBases > idealBasesPerPart) {
|
||||||
|
|
||||||
|
// Ideally we would split here. However, we must make sure to do so
|
||||||
|
// on a contig boundary. Test always passes with partMustStop == true
|
||||||
|
// since that indicates we're at a contig boundary.
|
||||||
|
|
||||||
|
GenomeLoc nextLoc = null;
|
||||||
|
if((i + 1) < locs.size())
|
||||||
|
nextLoc = locs.get(i+1);
|
||||||
|
|
||||||
|
if(nextLoc == null || !nextLoc.getContig().equals(loc.getContig())) {
|
||||||
|
|
||||||
|
// Write out this part:
|
||||||
|
outList.write(scatterParts.get(partIdx));
|
||||||
|
|
||||||
|
// Reset. If this part ran long, leave the excess in thisPartBases
|
||||||
|
// and the next will be a little shorter to compensate.
|
||||||
|
outList = new IntervalList(fileHeader);
|
||||||
|
thisPartBases -= idealBasesPerPart;
|
||||||
|
++partIdx;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue