Simple python script for generating a firehose-parseable text file from MS-Dos formatted TSV spreadsheets (of the type that we get from project management). Will be deprecated in a few weeks with the advent of direct BSP ID entries, but useful until then.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2820 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c66861746a
commit
d16a1b5645
|
|
@ -0,0 +1,76 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
bamfile_base = "/seq/picard_aggregation/"
|
||||
fingerprint_base = "/seq/references/reference_genotypes/non-hapmap/"
|
||||
hg18_reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
|
||||
hg18_dbsnp = "/humgen/gsa-hpprojects/GATK/data/dbsnp_130_hg18.rod"
|
||||
b36_dbsnp = "/humgen/gsa-hpprojects/GATK/data/dbsnp_130_b36.rod"
|
||||
b36_reference = "/broad/1KG/reference/human_b36_both.fasta"
|
||||
hg18_intervals = "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list"
|
||||
b36_intervals = ""
|
||||
|
||||
min_base_q = "10"
|
||||
min_map_q = "10"
|
||||
max_reads = "1000000"
|
||||
min_conf = "0"
|
||||
|
||||
spreadsheetPath = sys.argv[3]
|
||||
projectName = sys.argv[2]
|
||||
groupName = sys.argv[1]
|
||||
reference = sys.argv[4]
|
||||
|
||||
if ( reference != "hg18" and reference != "b36" ):
|
||||
raise ValueError("Illegal reference type")
|
||||
elif ( reference == "hg18" ):
|
||||
reference = hg18_reference
|
||||
dbsnp = hg18_dbsnp
|
||||
intervals = hg18_intervals
|
||||
fpref = "Homo_sapiens_assembly18"
|
||||
else:
|
||||
reference = b36_reference
|
||||
dbsnp = b36_dbsnp
|
||||
intervals = b36_intervals
|
||||
fpref = "human_b36"
|
||||
|
||||
outputFile = projectName+"_bam_files.txt"
|
||||
OUTPUT_HEADER = ["sample_id","recalbrated_bam_file","individual_id","fingerprint_file","reference_file","dbsnp_file","interval_list","max_reads_at_locus","min_confidence","min_mapping_quality","min_base_quality"]
|
||||
|
||||
if ( spreadsheetPath.find("/") > -1 ):
|
||||
newSpreadsheet = spreadsheetPath.rsplit("/",1)[1].rsplit(".",1)[0]+"_proper_format.tsv"
|
||||
else:
|
||||
newSpreadsheet = spreadsheetPath.rsplit(".",1)[0]+"_proper_format.tsv"
|
||||
|
||||
# convert to proper format
|
||||
os.system("sed 's/\\r/\\n/g' "+spreadsheetPath+" > "+newSpreadsheet)
|
||||
|
||||
project_info = open(newSpreadsheet)
|
||||
|
||||
header = project_info.readline().strip().split("\t")
|
||||
|
||||
project_index = header.index("Project")
|
||||
sample_index = header.index("Sample")
|
||||
|
||||
def versionCompare(version1,version2):
|
||||
return -int(version1.split("v")[1])+int(version2.split("v")[1])
|
||||
|
||||
def getNewestVersion(baseDir):
|
||||
versions = os.listdir(baseDir)
|
||||
versions.sort(versionCompare)
|
||||
for version in versions:
|
||||
if ( "finished.txt" in os.listdir(baseDir+version+"/") ):
|
||||
return version
|
||||
|
||||
outputFile = open(outputFile,'w')
|
||||
outputFile.write("\t".join(OUTPUT_HEADER)+"\n")
|
||||
|
||||
for line in project_info.readlines():
|
||||
if ( not line.startswith("\n") and not line.startswith(" ") and not line.startswith("\t") ):
|
||||
spline = line.strip().split("\t")
|
||||
versioningDirectory = bamfile_base+spline[project_index]+"/"+spline[sample_index]+"/"
|
||||
version = getNewestVersion(versioningDirectory)
|
||||
bamfile = versioningDirectory+version+"/"+spline[sample_index]+".bam"
|
||||
fingerprint_file = fingerprint_base+spline[project_index]+"/"+fpref+"/"+spline[sample_index]+".fingerprint.geli"
|
||||
outputFile.write(projectName+"_"+spline[sample_index]+"\t"+bamfile+"\t"+groupName+"\t"+fingerprint_file+"\t"+reference+"\t"+dbsnp+"\t"+intervals+"\t"+max_reads+"\t"+min_conf+"\t"+min_map_q+"\t"+min_base_q+"\n")
|
||||
Loading…
Reference in New Issue