Updated python files
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1182 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f6a273a537
commit
f5b00c20d0
|
|
@ -15,7 +15,7 @@ pdf(outfile, height=7, width=7)
|
|||
d.good <- t[t$nMismatches >= 1000,]
|
||||
d.100 <- t[t$nMismatches < 100,]
|
||||
d.1000 <- t[t$nMismatches < 1000 & t$nMismatches >= 100,]
|
||||
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", xlim=c(0,45), ylim=c(0,45), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
|
||||
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", xlim=c(0,63), ylim=c(0,63), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
|
||||
points(d.100$Qreported, d.100$Qempirical, type="p", col="lightblue", pch=16)
|
||||
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="cornflowerblue", pch=16)
|
||||
abline(0,1, lty=2)
|
||||
|
|
@ -26,6 +26,16 @@ dev.off()
|
|||
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(t$Qempirical, t$nBases), t.nBases != 0)
|
||||
plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,45), main="Reported quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
|
||||
plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,63), main="Empirical quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
dev.off()
|
||||
|
||||
#
|
||||
# Plot Q reported histogram
|
||||
#
|
||||
outfile = paste(input, ".quality_rep_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(t$Qreported, t$nBases), t.nBases != 0)
|
||||
plot(hst$t.Qreported, hst$t.nBases, type="h", lwd=3, xlim=c(0,63), main="Reported quality score histogram", xlab="Qreported quality score", ylab="Count", yaxt="n")
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
dev.off()
|
||||
|
|
|
|||
|
|
@ -1,113 +0,0 @@
|
|||
Read Quality Recalibrator
|
||||
-------------------------
|
||||
The tools in this package recalibrate quality scores of
|
||||
Illumina reads in an aligned BAM file. After recalibration,
|
||||
the quality scores in the QUAL field in each Illumina read
|
||||
in the output BAM are accurate in that the reported quality
|
||||
score is equal to its actual probability of mismatching.
|
||||
This is process is accomplished by analyzing the covariation
|
||||
between machine reported quality scores and
|
||||
|
||||
1) the position within the read, and
|
||||
2) the preceding nucleotide (sequencing chemistry effect).
|
||||
|
||||
The aligned reads have their dbSNP sites masked out, and the
|
||||
mismatched bases are used as a metric for the true error rate
|
||||
of the system. The error rate at different dinucleotides and
|
||||
positions in the read is then fed into a logistic regression
|
||||
system which outputs a correction factor for each of those
|
||||
combinations which are then use to output a recalibrated BAM
|
||||
file.
|
||||
|
||||
Software Dependencies
|
||||
---------------------
|
||||
The recalibrator currently depends on the following
|
||||
applications. Please install these before proceeding.
|
||||
|
||||
- Java Runtime Environment 1.6.0_12 or later
|
||||
- Python 2.4.2 or later
|
||||
- R 2.6.0 or later
|
||||
- samtools 0.1.4 or later.
|
||||
|
||||
Please update the file paths at the top of
|
||||
RecalQual.py to point to the local installations of the
|
||||
software listed above.
|
||||
|
||||
Running
|
||||
-------
|
||||
Before running the tool, please update the file paths
|
||||
listed at the top of RecalQual.py to point to the
|
||||
local installations of the tools listed above.
|
||||
|
||||
The recalibrator has two modes: recalibration and
|
||||
evaluation, which can be run separately or jointly.
|
||||
By default, the recalibrator will recalibrate only.
|
||||
|
||||
To calibrate a given bam file, run the following
|
||||
command:
|
||||
|
||||
python RecalQual.py <source bam> <recalibrated bam>
|
||||
|
||||
After recalibration, performing evaluation will walk
|
||||
through the source BAM file again, remeasuring the
|
||||
differences between empirical vs. reported quality.
|
||||
The results of evaluation are comma-delimited text
|
||||
files (.csv) and graphs (.png) which can be found
|
||||
in the output directory (see below) for the given run.
|
||||
A successful recalibration should produce a plot of
|
||||
empirical vs. observed qualities that shows little
|
||||
difference between the two values.
|
||||
|
||||
To both recalibrate and evaluate, execute:
|
||||
|
||||
python RecalQual.py --recalibrate --evaluate <source bam> <recalibrated bam>
|
||||
|
||||
To (only) evaluate a given bam file after calibrating:
|
||||
|
||||
python RecalQual.py --evaluate <source bam> <recalibrated bam>
|
||||
|
||||
Platforms
|
||||
---------
|
||||
By default, the recalibrator processes only read groups
|
||||
originating from Illumina sequencers. To enable calibration
|
||||
for other platforms, edit the 'platforms' array at the
|
||||
top of RecalQual.py. Platforms specified here should
|
||||
case-insensitive match the "PL" attribute of the read
|
||||
group in the BAM file.
|
||||
|
||||
Output
|
||||
------
|
||||
The recalibration process keeps many incremental
|
||||
files around for future analysis. By default, all
|
||||
of these files will be grouped into the directory
|
||||
'output.<source bam>/'. By default, this directory
|
||||
will be created within the working directory. This
|
||||
location can be changed by editing the output_root
|
||||
variable at the top of RecalQual.py.
|
||||
|
||||
It is safe to delete this supplemental output
|
||||
directory at any time.
|
||||
|
||||
Known Issues
|
||||
------------
|
||||
- The recalibrator places severe memory demands on
|
||||
files with large numbers of read groups.
|
||||
- If running in 'evaluation' mode (see the 'Running'
|
||||
section above), X11 is required to generate the
|
||||
graphs. If running on a machine via ssh, be certain
|
||||
to enable X tunnelling.
|
||||
|
||||
Troubleshooting
|
||||
---------------
|
||||
- The memory requirements of the recalibrator will
|
||||
vary based on the type of JVM running the application
|
||||
and the number of read groups in the input bam file.
|
||||
If the application reports 'java.lang.OutOfMemoryError:
|
||||
Java heap space', increase the max heap size provided
|
||||
to the JVM by adding ' -Xmx????m' to the jvm_args
|
||||
variable in RecalQual.py, where '????' is the maximum
|
||||
available memory on the processing computer.
|
||||
|
||||
Support
|
||||
-------
|
||||
For support, please email gsadevelopers@broad.mit.edu.
|
||||
|
|
@ -13,7 +13,10 @@ ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
|
|||
|
||||
def geli2dbsnpFile(geli):
|
||||
root, flowcellDotlane, ext = picard_utils.splitPath(geli)
|
||||
return os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
|
||||
if ext == ".calls":
|
||||
return None
|
||||
else:
|
||||
return os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
|
||||
|
||||
def SingleSampleGenotyperCmd(bam, geli, use2bp):
|
||||
naid = os.path.split(bam)[1].split(".")[0]
|
||||
|
|
@ -49,7 +52,7 @@ def gelis2gelisText( gelis ):
|
|||
if os.path.splitext(maybeGeli)[1] == ".calls" :
|
||||
return maybeGeli
|
||||
else:
|
||||
return os.path.split(geli)[1] + '.calls'
|
||||
return os.path.split(maybeGeli)[1] + '.calls'
|
||||
|
||||
return map( geli2geliText, gelis)
|
||||
|
||||
|
|
@ -84,7 +87,7 @@ def main():
|
|||
(OPTIONS, args) = parser.parse_args()
|
||||
if len(args) != 2:
|
||||
parser.error("incorrect number of arguments: " + str(args))
|
||||
lines = [line.split() for line in open(args[0])]
|
||||
lines = [line.strip().split() for line in open(args[0])]
|
||||
nIndividuals = int(args[1])
|
||||
|
||||
outputFile = open(OPTIONS.output, 'w')
|
||||
|
|
@ -108,7 +111,7 @@ def main():
|
|||
|
||||
for geli, jobid in zip(gelis, jobids):
|
||||
dbsnpFile = geli2dbsnpFile(geli)
|
||||
if not os.path.exists(dbsnpFile):
|
||||
if dbsnpFile == None and not os.path.exists(dbsnpFile):
|
||||
dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod)
|
||||
if jobid == 0: jobid = None
|
||||
farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
|
||||
|
|
@ -151,14 +154,14 @@ def main():
|
|||
sortedCalls = [line.split() for line in open(sortedCallFile)]
|
||||
aggregratedCalls = picard_utils.aggregateGeliCalls(sortedCalls)
|
||||
|
||||
print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom individuals'
|
||||
print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom'
|
||||
|
||||
for snp in map( lambda x: picard_utils.aggregatedGeliCalls2SNP(x, nIndividuals), aggregratedCalls ):
|
||||
if snp == None: continue # ignore bad calls
|
||||
#print snp
|
||||
#sharedCalls = list(sharedCallsGroup)
|
||||
#genotype = list(sharedCalls[0][5])
|
||||
print >> outputFile, '%s %s %s %.6f -420.0 -420.0 0.000000 100.0 %d %d %d %d NA' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes())
|
||||
print >> outputFile, '%s %s %s %.6f -0.0 -0.0 0.000000 100.0 %d %d %d %d' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes())
|
||||
outputFile.close()
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ from itertools import *
|
|||
import math
|
||||
import operator
|
||||
|
||||
MAX_QUAL_SCORE = 50
|
||||
MAX_QUAL_SCORE = 40
|
||||
|
||||
def phredQScore( nMismatches, nBases ):
|
||||
"""Calculates a phred-scaled score for nMismatches in nBases"""
|
||||
|
|
@ -293,19 +293,25 @@ def qReportedVsqEmpiricalStream(readGroup, data):
|
|||
yield key, datum
|
||||
|
||||
def qReportedVsqEmpirical(readGroup, allData, output):
|
||||
print >> output, 'Qreported Qempirical nMismatches nBases'
|
||||
print >> output, 'Qreported Qempirical nMismatches nBases PercentBases'
|
||||
rg, allBases = groupRecalData(allData, key=RecalData.readGroup)[0]
|
||||
for key, datum in qReportedVsqEmpiricalStream(readGroup, allData):
|
||||
#if datum.qReported() > 35:
|
||||
# print datum
|
||||
print >> output, "%2.2f %2.2f %12d %12d" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases())
|
||||
print >> output, "%2.2f %2.2f %12d %12d %.2f" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases(), 100.0*datum.getNBases() / float(allBases.getNBases()))
|
||||
|
||||
def analyzeRawData(rawDataFile):
|
||||
nReadGroups = 0
|
||||
for readGroup, data in rawDataByReadGroup(rawDataFile):
|
||||
if OPTIONS.selectedReadGroups == [] or readGroup in OPTIONS.selectedReadGroups:
|
||||
root, sourceFilename = os.path.split(rawDataFile)
|
||||
if ( OPTIONS.outputDir ): root = OPTIONS.outputDir
|
||||
outputRoot = os.path.join(root, "%s.%s.%s" % ( sourceFilename, readGroup, 'analysis' ))
|
||||
analyzeReadGroup(readGroup, data, outputRoot)
|
||||
nReadGroups += 1
|
||||
if nReadGroups > OPTIONS.maxReadGroups and OPTIONS.maxReadGroups <> -1:
|
||||
break
|
||||
else:
|
||||
root, sourceFilename = os.path.split(rawDataFile)
|
||||
if ( OPTIONS.outputDir ): root = OPTIONS.outputDir
|
||||
outputRoot = os.path.join(root, "%s.%s.%s" % ( sourceFilename, readGroup, 'analysis' ))
|
||||
analyzeReadGroup(readGroup, data, outputRoot)
|
||||
|
||||
plottersByFile = {
|
||||
"raw_data.csv$" : analyzeRawData,
|
||||
|
|
@ -344,6 +350,9 @@ def main():
|
|||
parser.add_option("-d", "--dir", dest="outputDir",
|
||||
type="string", default=None,
|
||||
help="If provided, analysis output files will be written to this directory")
|
||||
parser.add_option("-m", "--maxReadGroups", dest="maxReadGroups",
|
||||
type="int", default=-1,
|
||||
help="Maximum number of read groups to process. The default of -1 indicates that all read groups will be processed")
|
||||
parser.add_option("-c", "--config", dest="configs",
|
||||
action="append", type="string", default=[],
|
||||
help="Configuration file")
|
||||
|
|
@ -371,7 +380,8 @@ def main():
|
|||
parser.error("Requires at least one configuration file be provided")
|
||||
|
||||
config = gatkConfigParser(OPTIONS.configs)
|
||||
|
||||
|
||||
if OPTIONS.selectedReadGroups <> []: print 'Analyzing only the following read groups', OPTIONS.selectedReadGroups
|
||||
analyzeFiles(args)
|
||||
|
||||
import unittest
|
||||
|
|
@ -400,4 +410,4 @@ class TestanalzyeRecalQuals(unittest.TestCase):
|
|||
if __name__ == '__main__':
|
||||
main()
|
||||
#unittest.main()
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ if __name__ == "__main__":
|
|||
type="string", default="",
|
||||
help="arguments to GATK")
|
||||
parser.add_option("-m", "--mode", dest="RecalMode",
|
||||
type="string", default="",
|
||||
type="string", default="SEQUENTIAL",
|
||||
help="Mode argument to provide to table recalibrator")
|
||||
parser.add_option("-q", "--farm", dest="farmQueue",
|
||||
type="string", default=None,
|
||||
|
|
@ -22,14 +22,14 @@ if __name__ == "__main__":
|
|||
action="append", type="string", default=[],
|
||||
help="Configuration file")
|
||||
parser.add_option("-d", "--dir", dest="scratchDir",
|
||||
type="string", default="test",
|
||||
type="string", default="./",
|
||||
help="Output directory")
|
||||
parser.add_option("-w", "--wait", dest="initialWaitID",
|
||||
type="string", default=None,
|
||||
help="If providedm the first job dispatched to LSF will use this id as it ended() prerequisite")
|
||||
parser.add_option("", "--dry", dest="dry",
|
||||
action='store_true', default=False,
|
||||
help="If provided, nothing actually gets run, just a dry run")
|
||||
parser.add_option("", "--plot", dest="plot",
|
||||
action='store_true', default=False,
|
||||
help="If provided, will call R to generate convenient plots about the Q scores of the pre and post calibrated files")
|
||||
parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
|
||||
action='store_true', default=False,
|
||||
help="Ignores already written files, if present")
|
||||
|
|
@ -54,10 +54,10 @@ if __name__ == "__main__":
|
|||
|
||||
def covariateCmd(bam, outputDir):
|
||||
add = " -I %s --OUTPUT_FILEROOT %s" % (bam, outputDir)
|
||||
return config.gatkCmd('CountCovariates') + add
|
||||
return config.gatkCmd('CountCovariates', log=outputDir, stdLogName=True) + add
|
||||
|
||||
def recalibrateCmd(inputBAM, dataFile, outputBAM):
|
||||
return config.gatkCmd('TableRecalibration') + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode)
|
||||
return config.gatkCmd('TableRecalibration', log=outputBAM, stdLogName=True) + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode)
|
||||
|
||||
def runCovariateCmd(inputBAM, dataFile, dir, jobid):
|
||||
if OPTIONS.ignoreExistingFiles or not os.path.exists(dataFile):
|
||||
|
|
@ -67,7 +67,7 @@ if __name__ == "__main__":
|
|||
#
|
||||
# Actually do some work here
|
||||
#
|
||||
jobid = None
|
||||
jobid = OPTIONS.initialWaitID
|
||||
if OPTIONS.ignoreExistingFiles or not os.path.exists(initDataFile):
|
||||
jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid)
|
||||
|
||||
|
|
@ -77,4 +77,4 @@ if __name__ == "__main__":
|
|||
jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
|
||||
|
||||
jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_comman
|
|||
cmd_str += " -o " + farm_stdout
|
||||
|
||||
if waitID <> None:
|
||||
cmd_str += " -w \"done(%s)\"" % (str(waitID))
|
||||
cmd_str += " -w \"ended(%s)\"" % (str(waitID))
|
||||
|
||||
cmd_str += " \""+cmd_str_from_user + "\""
|
||||
|
||||
|
|
|
|||
|
|
@ -71,9 +71,16 @@ class gatkConfigParser(ConfigParser.SafeConfigParser):
|
|||
def gatk_args(self): return self.getOption(self.GATK, 'args')
|
||||
def reference(self): return self.getOption(self.GATK, 'reference')
|
||||
|
||||
def gatkCmd(self, mode):
|
||||
def gatkCmd(self, mode, log = None, stdLogName=False):
|
||||
cmd = ' '.join([self.java(), self.jvm_args(), '-jar', self.jar(), self.gatk_args(), '-R', self.reference()])
|
||||
cmd += ' ' + ' '.join(['-T', mode, self.getGATKModeOption('args', mode)])
|
||||
if log <> None:
|
||||
if stdLogName:
|
||||
#head, ext = os.path.splitext(log)
|
||||
logName = log + "." + mode + ".log"
|
||||
else:
|
||||
logName = log
|
||||
cmd += ' ' + ' '.join(['-log', logName])
|
||||
return cmd
|
||||
|
||||
import unittest
|
||||
|
|
|
|||
|
|
@ -164,7 +164,7 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True,
|
|||
MSDStr = ''
|
||||
if MSD: MSDStr = 'MSD=true'
|
||||
|
||||
return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + compression_level + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||
return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + str(compression_level) + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||
|
||||
def getPicardPath(lane, picardRoot = '/seq/picard/'):
|
||||
|
|
|
|||
Loading…
Reference in New Issue