gatk-3.8/scala/qscript/chartl/private_mutations.q

import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils
import org.broadinstitute.sting.queue.extensions.gatk.{UnifiedGenotyper, RodBind, CombineVariants, SelectVariants}
import org.broadinstitute.sting.queue.QScript
import tools.nsc.io.File

class private_mutations extends QScript {
  pm_script => // alias for the script arguments

  val eomi_merged_calls : File = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/esp_merged.vcf")
  val g1k_exome_calls : File = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/g1k_exomes.vcf")
  val ceu_pilot_calls : File = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/CEU.low_coverage.genotypes.vcf")

  val SINGLE_SAMPLE_EXPRESSION = "\"PASS.*AC=1;|PASS.*AC=2;.*1/1|PASS.*AC=2.*1\\|1\""
  val NOVEL_VARIANT_EXPRESSION = "-v \"rs[0-9]\""
  val KNOWN_VARIANT_EXPRESSION = "\"rs[0-9].*PASS\""

  val BASE_DIR = "/humgen/gsa-hphome1/chartl/projects/private_mutations/queue/"
  val LIST_DIR = BASE_DIR + "lists/"
  val VCF_DIR = BASE_DIR + "vcfs/"
  val EVAL_DIR = BASE_DIR + "evals/"
  val SCRATCH_DIR = BASE_DIR + "intermediate/"

  /*
   * Steps of analysis.
   *
   * todo -- 1) Calculate number of EOMI calls top-down, cumulative include/exclude that are:
   *  - present in one sample
   *  - not in dbsnp 129
   *  - not in 1000G pilot CEU
   *  - not in 1000G production EUR
   *  - not in 1000G production exome EUR
   *
   * todo -- 2) Distributional aspects of remaining mutations
   *  - Depth of Coverage
   *  - Functional Class
   *
   * todo -- 3) 1000G Exome v Lowpass -- Venn of single-sample variants
   *  - Venn numbers
   *  - Distribution of depth for exome variants missed in low-pass
   *
   * todo -- 4) Estimate relationship between depth and sensitivity to private variation
   *
   * todo -- 5) Calculate % of exome empowered to detect private variation
   *
   * todo -- 6) Identify loci across 1000G Ex and EOMI that are low-powered
   *   - venn of low-power targets
   *   - mapping the intersection
   *   - genes affected by low power
   */


  /*
   * Scripting commands go here
   */
  def script = {
    // setup analysis resources (best done in loop)
    val g1k_lowpass_chr : List[File] = (1 to 22).toList.map( i => new File("/humgen/1kg/processing/allPopulations_wholeGenome_august_release/calls/chr%d/ALL.august.chr%d.recal.vrcut.EUR.vcf".format(i,i)))
    val eomi_annot_set : List[File] = (1 to 7).toList.map( i => new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/esp_set%d.vcf".format(i)))
    val g1k_lowpass_hg19_merged : File = new File(VCF_DIR+"g1k_lowpass_merged.hg19.sites.vcf")
    val eomi_merged_hg19 : File = eomi_merged_calls
    val ceu_hg19 : File = new File(VCF_DIR+"G1K_Pilot.CEU.hg19.vcf")

    combineAndMergeSites(g1k_lowpass_chr,g1k_lowpass_hg19_merged)
    liftoverVCF(ceu_pilot_calls,ceu_hg19)


    // get EOMI ss-only list
    var eomi_single_sample : Vcf2List = new Vcf2List
    eomi_single_sample.in_vcf = eomi_merged_hg19
    eomi_single_sample.filter = SINGLE_SAMPLE_EXPRESSION
    eomi_single_sample.out_list = new File(LIST_DIR+"EOMI_single_sample_variants.sites.list")

    add(eomi_single_sample)

    var eomi_dbsnp : Vcf2List = new Vcf2List
    eomi_dbsnp.in_vcf = eomi_merged_hg19
    eomi_dbsnp.filter = KNOWN_VARIANT_EXPRESSION
    eomi_dbsnp.out_list = new File(LIST_DIR+"EOMI_novel_variants.sites.list")

    add(eomi_dbsnp)

    var ceu_sites : Vcf2List = new Vcf2List
    ceu_sites.in_vcf = ceu_hg19
    ceu_sites.out_list = new File(SCRATCH_DIR+"G1KP_CEU_variants.sites.list")

    add(ceu_sites)

    var g1k_lowpass_sites : Vcf2List = new Vcf2List
    g1k_lowpass_sites.in_vcf = g1k_lowpass_hg19_merged
    g1k_lowpass_sites.out_list = new File(SCRATCH_DIR+"G1K_lowpass_EUR.sites.list")

    add(g1k_lowpass_sites)

    var g1k_exome_sites : Vcf2List = new Vcf2List
    g1k_exome_sites.in_vcf = g1k_exome_calls
    g1k_exome_sites.out_list = new File(SCRATCH_DIR+"G1K_exomes.sites.list")

    add(g1k_exome_sites)

    var remove_dbsnp : RemoveIntersect = new RemoveIntersect
    remove_dbsnp.fileToUnique = eomi_single_sample.out_list
    remove_dbsnp.comparisonFile = eomi_dbsnp.out_list
    remove_dbsnp.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.list")

    add(remove_dbsnp)

    var remove_ceu : RemoveIntersect = new RemoveIntersect
    remove_ceu.fileToUnique = remove_dbsnp.outputFile
    remove_ceu.comparisonFile = ceu_sites.out_list
    remove_ceu.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.noceu.list")

    add(remove_ceu)

    var remove_lowpass : RemoveIntersect = new RemoveIntersect
    remove_lowpass.fileToUnique = remove_ceu.outputFile
    remove_lowpass.comparisonFile = g1k_lowpass_sites.out_list
    remove_lowpass.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.noceu.noeur.list")

    add(remove_lowpass)

    var remove_exome : RemoveIntersect = new RemoveIntersect
    remove_exome.fileToUnique = remove_lowpass.outputFile
    remove_exome.comparisonFile = g1k_exome_sites.out_list
    remove_exome.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.noceu.noeur.noexome.list")

    add(remove_exome)

    var subset : SelectVariants = new SelectVariants
    subset.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
    subset.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
    subset.variantVCF = eomi_merged_hg19
    subset.out = new File(VCF_DIR+"EOMI_merged.hg19.private.vcf")
    subset.intervals :+= remove_exome.outputFile

    add(subset)

    var varDepths : GetVariantDepths = new GetVariantDepths
    varDepths.inVCF = subset.out
    varDepths.outFile = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/results/EOMI.private.variant.depths.txt")

    add(varDepths)

    var lowpass_single : Vcf2List = new Vcf2List
    lowpass_single.in_vcf = g1k_lowpass_hg19_merged
    lowpass_single.filter = SINGLE_SAMPLE_EXPRESSION
    lowpass_single.out_list = new File(LIST_DIR+"g1k_lowpass_single_sample.variants.list")

    add(lowpass_single)

    var exome_single : Vcf2List = new Vcf2List
    exome_single.in_vcf = g1k_exome_calls
    exome_single.filter = SINGLE_SAMPLE_EXPRESSION
    exome_single.out_list = new File(LIST_DIR+"g1k_exome_single_sample.variants.list")

    add(exome_single)

    var exome_extract : ExtractSites = new ExtractSites(g1k_exome_calls,new File(VCF_DIR+"g1k_exome.sites.vcf"))
    add(exome_extract)

    var getVennExS : CombineVariants = new CombineVariants
    getVennExS.rodBind :+= new RodBind("lowpass","VCF",g1k_lowpass_hg19_merged);
    getVennExS.rodBind :+= new RodBind("exome","VCF",exome_extract.outputVCF);
    getVennExS.priority = "exome,lowpass"
    getVennExS.intervals :+= exome_single.out_list
    getVennExS.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
    getVennExS.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
    getVennExS.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.UNIQUIFY)
    getVennExS.variantMergeOptions = Some(VariantContextUtils.VariantMergeType.UNION)
    getVennExS.out = new File(VCF_DIR + "g1k_exome_plus_lowpass.singlesample.merged.exome.sites.vcf")

    //add(getVennExS)

    var getVennLPS : CombineVariants = new CombineVariants
    getVennLPS.rodBind :+= new RodBind("lowpass","VCF",g1k_lowpass_hg19_merged);
    getVennLPS.rodBind :+= new RodBind("exome","VCF",exome_extract.outputVCF);
    getVennLPS.priority = "exome,lowpass"
    getVennLPS.intervals :+= lowpass_single.out_list
    getVennLPS.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
    getVennLPS.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
    getVennLPS.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.UNIQUIFY)
    getVennLPS.variantMergeOptions = Some(VariantContextUtils.VariantMergeType.UNION)
    getVennLPS.out = new File(VCF_DIR + "g1k_exome_plus_lowpass.singlesample.merged.lowpass.sites.vcf")

    add(getVennLPS)

    var getG1KOverlap : GetOverlapSamples = new GetOverlapSamples
    getG1KOverlap.vcf1 = g1k_exome_calls
    getG1KOverlap.vcf2 = g1k_lowpass_chr(1) // EUR only
    getG1KOverlap.samples = new File(LIST_DIR+"g1k_EUR_exome_lowpass_overlap.txt")

    add(getG1KOverlap)

    //callOverlaps(getG1KOverlap.samples,exome_single.out_list, g1k_exome_calls)

  }

  /*
   * defined modules go here
   */

  class Vcf2List extends CommandLineFunction {
    @Input(doc="The vcf file to convert to list",required=true)
    var in_vcf: File = _
    @Argument(doc="An egrep-based filter to apply during conversion",required=false)
    var filter: String = "PASS"
    @Output(doc="The list file to write to", required=true)
    var out_list: File = _

    def commandLine = {
      "egrep %s %s | awk '{print $1\":\"$2}' > %s".format(filter,in_vcf.getAbsolutePath,out_list.getAbsolutePath)
    }
  }

  class GetOverlapSamples extends CommandLineFunction {
    @Input(doc="vcf1")
    var vcf1: File = _
    @Input(doc="vcf2")
    var vcf2: File = _
    @Output(doc="sample file")
    var samples: File = _

    def commandLine = {
      "head -n 500 %s %s | grep \\\\#CHR | cut -f10- | tr '\\t' '\\n' | sort | uniq -c | awk '{if ($1 == 2) print $2}' > %s".format(
        vcf1.getAbsolutePath,
        vcf2.getAbsolutePath,
        samples.getAbsolutePath
        )
    }
  }

  class RemoveIntersect extends CommandLineFunction {
    @Input(doc="The vcf file whose unique entries should be retained",required=true)
    var fileToUnique: File = _
    @Input(doc="The vcf file whose entries overlapping the unique file you would like to remove",required=true)
    var comparisonFile: File = _
    @Output(doc="The file containing only the fileToUnique-unique entries", required=true)
    var outputFile: File = _

    def commandLine = {
      var tmpFile: File = java.io.File.createTempFile("removeintersect","tmp")

      "cat %s %s | sort | uniq -c | awk '{if ($1==2) print $2}' > %s ; cat %s %s | sort | uniq -c | awk '{if ($1 == 1) print $2}' > %s".format(
      fileToUnique.getAbsolutePath,comparisonFile.getAbsolutePath, tmpFile.getAbsolutePath,
      tmpFile.getAbsolutePath, fileToUnique.getAbsolutePath, outputFile.getAbsolutePath
        )
    }
  }

  class ExtractSites(inVCF: File, oVCF: File) extends CommandLineFunction {
    @Input(doc="The vcf file to generate a sites only file from",required=true)
    var inputVCF: File = inVCF
    @Output(doc="The sites-only vcf to write to")
    var outputVCF: File = oVCF

    def commandLine = {
      "cut -f1-9 %s > %s".format(inputVCF.getAbsolutePath,outputVCF.getAbsolutePath)
    }
  }

  class ConcatVCF(in: List[File], out: File) extends CommandLineFunction {
    @Input(doc="The files to concatenate",required=true)
    var inputVCFs : List[File] = in
    @Output(doc="The file to write to", required=true)
    var outputVCF: File = out

    def commandLine = {
      var header : File = java.io.File.createTempFile("concatVCF","header.tmp")
      var body_unsorted : File = java.io.File.createTempFile("concatVCF","body.unsorted.tmp")
      var body : File = java.io.File.createTempFile("concatVCF","body.tmp")

      "grep \\\\# %s > %s ; grep -v \\\\# %s | sed 's/.vcf:/\\t/g' | cut -f2- > %s ; perl %s -tmp . %s %s > %s ; cat %s %s > %s".format(
        inputVCFs(0).getAbsolutePath,
        header.getAbsolutePath,
        inputVCFs.foldLeft[String]("")((x1: String, x2: File ) => x1 + " " + x2.getAbsolutePath ),
        body_unsorted.getAbsolutePath,
        "/humgen/gsa-scr1/chartl/sting/perl/sortByRef.pl",
        body_unsorted.getAbsolutePath,
        "/humgen/1kg/reference/human_g1k_v37.fasta.fai",
        body.getAbsolutePath,
        header.getAbsolutePath,
        body.getAbsolutePath,
        outputVCF.getAbsolutePath
        )
    }
  }

  class ExtractSample(vcfIn: File, sample: String, vcfOut: File) extends CommandLineFunction {
    @Input(doc="File from which to extract sample")
    var inVCF: File = vcfIn
    @Argument(doc="The sample to extract")
    var inSample: String = sample
    @Output(doc="The VCF file to write to")
    var outVCF: File = vcfOut

    def commandLine = {
      "head -n 500 %s | grep \\\\#CHR | tr '\\t' '\\n' | grep -n %s | tr ':' '\\t' | cut -f1 | xargs -i cut -f1-9,\\{\\} %s | egrep \"\\\\#|PASS.*0/1|PASS.*1/0|PASS.*1/1\" > %s".format(
        inVCF.getAbsolutePath, inSample, inVCF.getAbsolutePath, outVCF.getAbsolutePath
        )
    }
  }

  class GetVariantDepths extends CommandLineFunction {
    @Input(doc="The VCF to get the per-sample depth at variant calls")
    var inVCF : File = _
    @Output(doc="The depth file to write to")
    var outFile : File = _

    def commandLine = {
      "grep PASS %s | cut -f10- | tr '\\t' '\\n' | egrep \"1/0|0/1|1/1|1\\|0|0\\|1|1\\|1\" | tr ':' '\\t' | awk '{print $3}' > %s".format(
        inVCF.getAbsolutePath, outFile.getAbsolutePath
        )
    }
  }

  def combineAndMergeSites(chrInputs: List[File], output: File) = {
    var sites_only: List[ExtractSites] = chrInputs.map( f => new ExtractSites(f,new File(VCF_DIR+swapExt(f,".vcf",".sites.vcf").getName)))
    for ( s <- sites_only ) {
      add(s)
    }
    var trivialMerge : ConcatVCF = new ConcatVCF(sites_only.map( s => s.outputVCF ), output)
    add(trivialMerge)
  }

  def liftoverVCF(b36vcf: File, b37vcf: File) = {

    class Liftover(in: File, out: File) extends CommandLineFunction {
      @Input(doc="foo")
      val input: File = in
      @Output(doc="foo")
      val output : File = out

      val sting : String = "/humgen/gsa-scr1/chartl/sting/"
      val chain : String = "/humgen/gsa-hpprojects/GATK/data/Liftover_Chain_Files/b36ToHg19.broad.over.chain"

      def commandLine = {
        "%s/perl/liftOverVCF.pl -vcf %s -chain %s -out %s -gatk %s -newRef %s -oldRef %s -tmp .".format(
        sting, input, chain, output, sting, "/seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19",
          "/humgen/1kg/reference/human_b36_both"
          )
      }

      class RMDupes(in: File, out:File) extends CommandLineFunction {
        @Input(doc="foo")
        var input : File = in
        @Output(doc="foo")
        var output : File = out

        def commandLine = "cat %s | python /humgen/gsa-scr1/chartl/projects/omni/scripts/filterDupes.py > %s".format(
        input.getAbsolutePath,output.getAbsolutePath
          )
      }

      var lift : Liftover = new Liftover(b36vcf,swapExt(b37vcf,".vcf",".undeduped.vcf"))

      add(lift)

      var rmDupes : RMDupes = new RMDupes(lift.output,b37vcf)

      add(rmDupes)
    }
  }

  def callOverlaps(samples: File, intervals: File, wexCalls: File)  = {
    class GetBamList(sample: String, oList: File) extends CommandLineFunction {
      @Argument(doc="foo")
      var inSample = sample
      @Output(doc="foo")
      var outList = oList
      @Input(doc = "foo")
      var waitForMe: File = _

      def commandLine = {
        "grep %s /humgen/1kg/processing/allPopulations_wholeGenome_august_release/bamLists/*.list | tr ':' '\\t' | awk '{print $2}' | sort | uniq > %s".format(
        inSample,
        outList.getAbsolutePath
          )
      }
    }
    for ( s <- scala.io.Source.fromFile(samples).getLines ) {
      var extract : ExtractSample = new ExtractSample(wexCalls,s,new File(VCF_DIR+"g1k_exome.subset.%s.vcf".format(s)))
      add(extract)

      var sites : Vcf2List = new Vcf2List
      sites.in_vcf = extract.outVCF
      sites.out_list = new File(SCRATCH_DIR+"g1k_exome.subset.%s.variants.list".format(s))
      add(sites)

      var bamList : GetBamList = new GetBamList(s,new File(SCRATCH_DIR+"%s.bams.list".format(s)))
      bamList.waitForMe = sites.out_list

      add(bamList)

      var genotype : UnifiedGenotyper = new UnifiedGenotyper
      genotype.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
      genotype.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
      genotype.intervals :+= sites.out_list
      genotype.out = new File(VCF_DIR+"g1k_lowpass.%s.exome_sites.vcf".format(s))
      genotype.input_file :+= bamList.outList
      genotype.memoryLimit = Some(3)
      genotype.output_all_callable_bases = true

      add(genotype)

    }
  }

}