diff --git a/perl/reports/1KG/AssembleSlides.pl b/perl/reports/1KG/AssembleSlides.pl deleted file mode 100755 index a6ab31787..000000000 --- a/perl/reports/1KG/AssembleSlides.pl +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use FindBin; -use lib "$FindBin::Bin/../../"; - -use StingArgs; - -my %args = &getCommandArguments("DIR" => undef, "OUT" => undef); - -my $outfile = $args{'OUT'}; -chomp(my @files = qx(find $args{'DIR'} -path \*slide\*.pdf | sort -n)); - -print join("\n", @files) . "\n"; - -my $cmd = "gs -q -sPAPERSIZE=letter -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -dAutoRotatePages=/All -sOutputFile=$outfile " . join(' ', @files); - -system($cmd); diff --git a/perl/reports/1KG/GenerateDataProcessingReport.pl b/perl/reports/1KG/GenerateDataProcessingReport.pl deleted file mode 100755 index 172e020fb..000000000 --- a/perl/reports/1KG/GenerateDataProcessingReport.pl +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use FindBin; -use lib "$FindBin::Bin/../../"; - -use StingArgs; -use DistributedMake; -use DataTable; -use Data::Dumper; - -my %args = &getCommandArguments( - "OUT_DIR" => undef, - "SEQUENCE_INDEX" => "/broad/1KG/DCC/ftp/sequence.index", - "ALIGNMENT_INDEX" => "/broad/1KG/DCC/ftp/alignment.index" -); - -my @sequenceIndex = &readTable($args{'SEQUENCE_INDEX'}, "header" => 1, "delimiter" => "\t"); -my @alignmentIndex = &readTable($args{'ALIGNMENT_INDEX'}, "header" => 1, "delimiter" => "\t"); - -my %results; - -foreach my $sequenceEntry (@sequenceIndex) { - my %sequenceEntry = %{$sequenceEntry}; - - if ($sequenceEntry{'STUDY_NAME'} =~ /1000/ && $sequenceEntry{'ANALYSIS_GROUP'} =~ /low coverage/ && $sequenceEntry{'FASTQ_FILE'} !~ /_\d.filt.fastq.gz/) { - ${${$results{'all'}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++; - ${${$results{$sequenceEntry{'STUDY_ID'}}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++; - - ${$results{'all'}}{'population'} = "all"; - ${$results{$sequenceEntry{'STUDY_ID'}}}{'population'} = $sequenceEntry{'POPULATION'}; - - ${$results{'all'}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0); - ${$results{$sequenceEntry{'STUDY_ID'}}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0); - - ${$results{'all'}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0); - ${$results{$sequenceEntry{'STUDY_ID'}}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0); - - if ($sequenceEntry{'WITHDRAWN'} == 0) { - ${${${$results{'all'}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++; - ${${${$results{$sequenceEntry{'STUDY_ID'}}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++; - } else { - ${${$results{'all'}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++; - ${${$results{$sequenceEntry{'STUDY_ID'}}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++; - } - - ${${$results{'all'}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++; - ${${$results{$sequenceEntry{'STUDY_ID'}}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++; - - ${${$results{'all'}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++; - ${${$results{$sequenceEntry{'STUDY_ID'}}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++; - - ${${$results{'all'}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++; - ${${$results{$sequenceEntry{'STUDY_ID'}}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++; - } -} - -foreach my $studyid (keys(%results)) { - my %study = %{${${$results{$studyid}}{'lanes_per_samples'}}}; - my @samples = keys(%study); - my $numSamples = scalar(@samples); - - ${$results{$studyid}}{'num_samples'} = $numSamples; -} - -print Dumper(\%results); diff --git a/perl/reports/1KG/GenerateDataProcessingReport2.pl b/perl/reports/1KG/GenerateDataProcessingReport2.pl deleted file mode 100755 index dd3ed7bc9..000000000 --- a/perl/reports/1KG/GenerateDataProcessingReport2.pl +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use FindBin; -use lib "$FindBin::Bin/../../"; - -use StingArgs; -use DistributedMake; -use DataTable; -use Data::Dumper; -use File::Basename; - -my %args = &getCommandArguments( - "LIST" => undef, - "OUT_DIR" => undef, - - "REFERENCE" => "/broad/1KG/reference/human_b36_both.fasta", - "DBSNP" => "/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod", - - "SEQUENCE_INDEX" => "/broad/1KG/DCC/ftp/sequence.index", - "ALIGNMENT_INDEX" => "/broad/1KG/DCC/ftp/alignment.index", - - "DRY_RUN" => 1, - "QUEUE" => "gsa", - "NUM_JOBS" => "", - "KEEP_GOING" => 1, -); - -my $dm = new DistributedMake("dryRun" => $args{'DRY_RUN'}, "queue" => $args{'QUEUE'}, "numJobs" => "", "keepGoing" => 1, "outputFile" => "$args{'OUT_DIR'}/GenerateDataProcessingReport2.log"); - -my @entries; - -open(LIST, $args{'LIST'}); -while (my $entry = ) { - chomp($entry); - - my %entry; - ($entry{'POP'}, $entry{'UG'}, $entry{'QCALL'}) = split(/\s+/, $entry); - - push(@entries, \%entry); -} -close(LIST); - -my @slides; - -foreach my $entry (@entries) { - my %entry = %{$entry}; - - # Combine VCFs - my $vcfCombineOut = "$args{'OUT_DIR'}/intermediate/$entry{'POP'}.combined.vcf"; - my $vcfCombineOutCmd = "java -Xmx2048m -jar $ENV{'STING_DIR'}/dist/GenomeAnalysisTK.jar -T VCFCombine -R $args{'REFERENCE'} -type UNION -B UG,VCF,$entry{'UG'} -B QCALL,VCF,$entry{'QCALL'} -A -priority 'UG,QCALL' -o $vcfCombineOut -l INFO -L 1"; - $dm->addRule($vcfCombineOut, [$entry{'UG'}, $entry{'QCALL'}], $vcfCombineOutCmd); - - # Evaluate variants - my $evalRoot = "$vcfCombineOut.eval/eval"; - my $evalCounts = "$evalRoot.Count_Variants.csv"; - my $evalCmd = "java -Xmx2048m -jar $ENV{'STING_DIR'}/dist/GenomeAnalysisTK.jar -T VariantEval -R $args{'REFERENCE'} -D $args{'DBSNP'} -B eval,VCF,$vcfCombineOut -select 'set==\"Intersection\"' -selectName Intersection -select 'set==\"UG-filteredInOther\"' -selectName InUG-FilteredInQCALL -select 'set==\"QCALL-filteredInOther\"' -selectName InQCALL-FilteredInUG -select 'set==\"UG\"' -selectName UG -select 'set==\"QCALL\"' -selectName QCALL -reportType R -reportLocation $evalRoot -l INFO -L 1"; - $dm->addRule($evalCounts, $vcfCombineOut, $evalCmd); - - # Get SNPs per sample - my $snpsPerSample = "$evalRoot.persample.table"; - my $snpsPerSampleCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/GetSNPsPerSample.pl VCF=$vcfCombineOut OUT=$snpsPerSample"; - $dm->addRule($snpsPerSample, $vcfCombineOut, $snpsPerSampleCmd); - - # Create topsheet slide - my $slideDir = "$args{'OUT_DIR'}/intermediate/$entry{'POP'}/slides"; - my $topSheetRoot = "$slideDir/$entry{'POP'}.000.topsheet"; - my $topSheetSlide = "$topSheetRoot.pdf"; - my $topSheetSlideCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/MakeTopSheetSlide.pl VCF=$vcfCombineOut SEQUENCE_INDEX=$args{'SEQUENCE_INDEX'} ALIGNMENT_INDEX=$args{'ALIGNMENT_INDEX'} EVAL_ROOT=$evalRoot OUT=$topSheetRoot"; - #$dm->addRule($topSheetSlide, [$vcfCombineOut, $args{'SEQUENCE_INDEX'}, $args{'ALIGNMENT_INDEX'}, $evalCounts], $topSheetSlideCmd); - - # Create Venn diagram slide of variants - my $vennRoot = "$slideDir/$entry{'POP'}.010.venn"; - my $vennPdf = "$vennRoot.pdf"; - my $vennPdfCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/MakeVennSlide.pl EVAL_ROOT=$evalRoot OUT_ROOT=$vennRoot POP=$entry{'POP'}"; - $dm->addRule($vennPdf, $evalCounts, $vennPdfCmd); - - # Create plots for other slides - my $plotRoot = "$evalRoot.plot"; - my $plotDummy = "$plotRoot.venn_per_ac.norm.pdf"; - my $plotCmd = "Rscript $ENV{'STING_DIR'}/perl/reports/1KG/MakeDataProcessingReportPlots.R $evalRoot $plotRoot $entry{'POP'}"; - $dm->addRule($plotDummy, [$evalCounts, $snpsPerSample], $plotCmd, 'queue' => ""); - - # Create other slides - my $dprSlideDummy = "$slideDir/$entry{'POP'}.050.eval.plot.titv.pdf"; - my $dprSlidesCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/MakeDPRSlides.pl PLOT_ROOT=$plotRoot OUT_DIR=$slideDir POP=$entry{'POP'}"; - $dm->addRule($dprSlideDummy, $plotDummy, $dprSlidesCmd); - - push(@slides, $dprSlideDummy); -} - -my $assembledSlides = "$args{'OUT_DIR'}/dpr.pdf"; -my $assembledSlidesCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/AssembleSlides.pl DIR=$args{'OUT_DIR'} OUT=$assembledSlides"; -$dm->addRule($assembledSlides, \@slides, $assembledSlidesCmd); - -$dm->execute(); diff --git a/perl/reports/1KG/GetSNPsPerSample.pl b/perl/reports/1KG/GetSNPsPerSample.pl deleted file mode 100755 index 191681ced..000000000 --- a/perl/reports/1KG/GetSNPsPerSample.pl +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use FindBin; -use lib "$FindBin::Bin/../../"; - -use StingArgs; - -my %args = &getCommandArguments("VCF" => undef, "OUT" => "/dev/stdout"); - -my %table; -my %sets; - -open(VCF, $args{'VCF'}); - -my @header; -while (my $line = ) { - chomp($line); - if ($line =~ /#/) { - if ($line =~ /CHROM/) { - $line =~ s/#//g; - @header = split(/\s+/, $line); - } - } else { - my @columns = split(/\s+/, $line); - - if ($columns[0] eq '1') { - my %entry; - for (my $i = 0; $i <= $#columns; $i++) { - $entry{$header[$i]} = $columns[$i]; - } - - my ($set) = $entry{'INFO'} =~ /set=(\w+)/; - $sets{$set} = 1; - - for (my $i = 9; $i <= $#header; $i++) { - if ($entry{$header[$i]} =~ /0[\\\|\/]1/ || - $entry{$header[$i]} =~ /1[\\\|\/]0/ || - $entry{$header[$i]} =~ /1[\\\|\/]1/) { - - ${$table{$header[$i]}}{$set}++; - $sets{$set} = 1; - } - } - } - } -} -close(VCF); - -open(OUT, ">$args{'OUT'}"); - -print OUT "sample\t" . join("\t", sort { $a cmp $b } keys(%sets)) . "\n"; - -foreach my $sample (sort { $a cmp $b } keys(%table)) { - print OUT "$sample"; - - foreach my $set (sort { $a cmp $b } keys(%sets)) { - my $value = 0; - if (exists(${$table{$sample}}{$set})) { - $value = ${$table{$sample}}{$set}; - } - - print OUT "\t$value"; - } - - print OUT "\n"; -} - -close(OUT); diff --git a/perl/reports/1KG/MakeDPRSlides.pl b/perl/reports/1KG/MakeDPRSlides.pl deleted file mode 100755 index 302056e12..000000000 --- a/perl/reports/1KG/MakeDPRSlides.pl +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use FindBin; -use lib "$FindBin::Bin/../../"; - -use StingArgs; -use File::Basename; -use DistributedMake; - -my %args = &getCommandArguments("PLOT_ROOT" => undef, "OUT_DIR" => undef, "POP" => undef); - -my $dm = new DistributedMake("dryRun" => 0, "numJobs" => "1"); - -my $timestamp = gmtime(); - -my @items = ( - { 'plotprefix' => "$args{'PLOT_ROOT'}.venn_per_ac", 'title' => 'Callset concordance per allele count', 'population' => $args{'POP'}, 'caption' => "Concordance of the UnifiedGenotyper and QCALL callsets, per allele count. Bottom and top sections of each bar represent the calls unique to each caller's callset, while the middle portion represents the intersection.", 'timestamp' => $timestamp }, - { 'plotprefix' => "$args{'PLOT_ROOT'}.venn_per_ac.norm", 'title' => 'Normalized callset concordance per allele count', 'population' => $args{'POP'}, 'caption' => "Concordance of the UnifiedGenotyper and QCALL callsets, per allele count, normalized by the number of variants in the callset union for that allele count bin. Bottom and top sections of each bar represent the calls unique to each caller's callset, while the middle portion represents the intersection.", 'timestamp' => $timestamp }, - { 'plotprefix' => "$args{'PLOT_ROOT'}.acs", 'title' => 'Allele count spectrum', 'population' => $args{'POP'}, 'caption' => 'Allele count spectrum for all known, novel, and all variants. Individual contributions from separate callers (thin, dotted or dashed lines) as well as the callset intersection (bold, solid lines) are shown.', 'timestamp' => $timestamp }, - { 'plotprefix' => "$args{'PLOT_ROOT'}.titv", 'title' => 'Ti/Tv per allele count', 'population' => $args{'POP'}, 'caption' => 'Ratio of transition mutations to transversions, per allele count. Whole-genome expectation is approximately 2.1. Fluctuation at higher allele counts is typically due to fewer available variants for the calculation at those frequencies.', 'timestamp' => $timestamp }, - { 'plotprefix' => "$args{'PLOT_ROOT'}.counts_per_sample", 'title' => 'SNP calls per sample', 'population' => $args{'POP'}, 'caption' => 'Number of SNPs per sample. As samples in plot are all from the same or similar populations, with presumably similar heterozygosity, all samples should have roughly consistent performance. Outliers may indicate problematic samples - perhaps in coverage, identity QC (sample mixup), or quality of sequencing.', 'timestamp' => $timestamp }, -); - -my $idnum = 1; - -foreach my $item (@items) { - $idnum++; - my $id = "$args{'POP'}." . sprintf("%03d", $idnum*10); - - my %item = %$item; - - my $basename = "$id." . basename($item{'plotprefix'}); - my $table = "$args{'OUT_DIR'}/$basename.kvp"; - - open(TABLE, ">$table"); - foreach my $key (sort { $a cmp $b } keys(%item)) { - print TABLE "$key\t$item{$key}\n"; - } - close(TABLE); - - my $render = "$args{'OUT_DIR'}/$basename.tex"; - my $renderCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/RenderTemplate.pl KVP=$table TEMPLATE=$ENV{'STING_DIR'}/perl/reports/1KG/plot_and_text_template.tex TEMPLATE_OUT=$render"; - $dm->addRule($render, "$item{'plotprefix'}.pdf", $renderCmd); - - my $pdf = "$args{'OUT_DIR'}/$basename.pdf"; - my $pdfdir = dirname($pdf); - my $pdfCmd = "pdflatex -output-directory $pdfdir $render"; - $dm->addRule($pdf, $render, $pdfCmd); -} - -$dm->execute(); diff --git a/perl/reports/1KG/MakeDataProcessingReportPlots.R b/perl/reports/1KG/MakeDataProcessingReportPlots.R deleted file mode 100644 index 119b07958..000000000 --- a/perl/reports/1KG/MakeDataProcessingReportPlots.R +++ /dev/null @@ -1,146 +0,0 @@ -args = commandArgs(TRUE); - -evalRoot = args[1]; -if (is.na(evalRoot)) { evalRoot = "results/v2/intermediate/CEU.combined.vcf.eval/eval"; } - -plotRoot = args[2]; -if (is.na(plotRoot)) { plotRoot = "results/v2/intermediate/CEU.combined.vcf.eval/eval.plot"; } - -population = args[3]; -if (is.na(population)) { population = "unknown population"; } - -fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep=""); -fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep=""); -fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep=""); -fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep=""); -fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep=""); -fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep=""); -fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep=""); -fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep=""); -fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep=""); -fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep=""); -fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep=""); -fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep=""); -filePerSample = paste(evalRoot, ".persample.table", sep=""); - -# Allele count spectrum -if (!is.na(plotRoot)) { - pdf(paste(plotRoot, ".acs.pdf", sep="")); -} else { - x11(); -} - -dataMetricsByAc = read.csv(fileMetricsByAc, header=TRUE, comment.char="#"); - -dataMetricsByAc.none.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "none" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),]; - -dataMetricsByAc.int.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "Intersection" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),]; -dataMetricsByAc.int.known = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "Intersection" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "known"),]; -dataMetricsByAc.int.novel = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "Intersection" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "novel"),]; - -dataMetricsByAc.UG.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "UG" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),] + dataMetricsByAc.int.all; -dataMetricsByAc.UG.known = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "UG" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "known"),] + dataMetricsByAc.int.known; -dataMetricsByAc.UG.novel = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "UG" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "novel"),] + dataMetricsByAc.int.novel; - -dataMetricsByAc.QCALL.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "QCALL" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),] + dataMetricsByAc.int.all; -dataMetricsByAc.QCALL.known = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "QCALL" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "known"),] + dataMetricsByAc.int.known; -dataMetricsByAc.QCALL.novel = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "QCALL" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "novel"),] + dataMetricsByAc.int.novel; - - -plot(0, 0, type="n", xlab="Allele count", ylab="Number of variants", xlim=c(0, max(dataMetricsByAc.none.all$AC)), ylim=c(min(dataMetricsByAc.none.all$n), max(dataMetricsByAc.none.all$n)), main=paste("Allele count spectrum (", population, ")", sep=""), cex=1.2, cex.axis=1.2, cex.lab=1.2, bty="n"); - -points(dataMetricsByAc.int.all$AC, dataMetricsByAc.int.all$n, type="l", lwd=3, col="black"); -points(dataMetricsByAc.int.known$AC, dataMetricsByAc.int.known$n, type="l", lwd=3, col="blue"); -points(dataMetricsByAc.int.novel$AC, dataMetricsByAc.int.novel$n, type="l", lwd=3, col="red"); - -points(dataMetricsByAc.UG.all$AC, dataMetricsByAc.UG.all$n, type="l", lwd=1, lty=2, col="black"); -points(dataMetricsByAc.UG.known$AC, dataMetricsByAc.UG.known$n, type="l", lwd=1, lty=2, col="blue"); -points(dataMetricsByAc.UG.novel$AC, dataMetricsByAc.UG.novel$n, type="l", lwd=1, lty=2, col="red"); - -points(dataMetricsByAc.QCALL.all$AC, dataMetricsByAc.QCALL.all$n, type="l", lwd=1, lty=3, col="black"); -points(dataMetricsByAc.QCALL.known$AC, dataMetricsByAc.QCALL.known$n, type="l", lwd=1, lty=3, col="blue"); -points(dataMetricsByAc.QCALL.novel$AC, dataMetricsByAc.QCALL.novel$n, type="l", lwd=1, lty=3, col="red"); - -legend("topright", c("Intersection: all", "Intersection: known (present in dbSNP 129)", "Intersection: novel (absent in dbSNP 129)", "UnifiedGenotyper: all", "UnifiedGenotyper: known (present in dbSNP 129)", "UnifiedGenotyper: novel (absent in dbSNP 129)", "QCALL: all", "QCALL: known (present in dbSNP 129)", "QCALL: novel (absent in dbSNP 129)"), lwd=c(3, 3, 3, 1, 1, 1, 1, 1, 1), lty=c(1, 1, 1, 2, 2, 2, 3, 3, 3), col=c("black", "blue", "red"), cex=1.2); - -if (!is.na(plotRoot)) { dev.off(); } - -# Ti/Tv per allele count -if (!is.na(plotRoot)) { - pdf(paste(plotRoot, ".titv.pdf", sep="")); -} else { - x11(); -} - -plot(0, 0, type="n", xlab="Allele count", ylab="Transition/transversion ratio", xlim=c(0, max(dataMetricsByAc.none.all$AC)), ylim=c(0, max(dataMetricsByAc$Ti.Tv)), main=paste("Ti/Tv per allele count (", population, ")", sep=""), cex=1.2, cex.lab=1.2, cex.axis=1.2, bty="n"); - -points(dataMetricsByAc.int.all$AC, dataMetricsByAc.int.all$Ti.Tv, type="l", lwd=3, lty=1, col="black"); -points(dataMetricsByAc.int.known$AC, dataMetricsByAc.int.known$Ti.Tv, type="l", lwd=3, lty=1, col="blue"); -points(dataMetricsByAc.int.novel$AC, dataMetricsByAc.int.novel$Ti.Tv, type="l", lwd=3, lty=1, col="red"); - -points(dataMetricsByAc.UG.all$AC, dataMetricsByAc.int.all$Ti.Tv, type="l", lwd=1, lty=2, col="black"); -points(dataMetricsByAc.UG.known$AC, dataMetricsByAc.int.known$Ti.Tv, type="l", lwd=1, lty=2, col="blue"); -points(dataMetricsByAc.UG.novel$AC, dataMetricsByAc.int.novel$Ti.Tv, type="l", lwd=1, lty=2, col="red"); - -points(dataMetricsByAc.QCALL.all$AC, dataMetricsByAc.int.all$Ti.Tv, type="l", lwd=1, lty=3, col="black"); -points(dataMetricsByAc.QCALL.known$AC, dataMetricsByAc.int.known$Ti.Tv, type="l", lwd=1, lty=3, col="blue"); -points(dataMetricsByAc.QCALL.novel$AC, dataMetricsByAc.int.novel$Ti.Tv, type="l", lwd=1, lty=3, col="red"); - -legend("topright", c("Intersection: all", "Intersection: known (present in dbSNP 129)", "Intersection: novel (absent in dbSNP 129)", "UnifiedGenotyper: all", "UnifiedGenotyper: known (present in dbSNP 129)", "UnifiedGenotyper: novel (absent in dbSNP 129)", "QCALL: all", "QCALL: known (present in dbSNP 129)", "QCALL: novel (absent in dbSNP 129)"), lwd=c(3, 3, 3, 1, 1, 1, 1, 1, 1), lty=c(1, 1, 1, 2, 2, 2, 3, 3, 3), col=c("black", "blue", "red"), cex=1.2); - -if (!is.na(plotRoot)) { dev.off(); } - -# Barplot (unnormalized) -if (!is.na(plotRoot)) { - pdf(paste(plotRoot, ".venn_per_ac.pdf", sep="")); -} else { - x11(); -} - -norm = dataMetricsByAc.UG.all$n + dataMetricsByAc.int.all$n + dataMetricsByAc.QCALL.all$n; -mat = rbind(dataMetricsByAc.QCALL.all$n, dataMetricsByAc.int.all$n, dataMetricsByAc.UG.all$n); -matnorm = rbind(dataMetricsByAc.QCALL.all$n/norm, dataMetricsByAc.int.all$n/norm, dataMetricsByAc.UG.all$n/norm); - -qcallcolor = "#FF5555"; -intcolor = "#5582C6"; -ugcolor = "#55BBFF"; - -barplot(mat, col=c(qcallcolor, intcolor, ugcolor), xlab="Allele count", ylab="counts", main=paste("Callset concordance per allele count (", population, ")", sep=""), names.arg=dataMetricsByAc.UG.all$AC); -legend("topright", c("UG-only", "Intersection", "QCALL-only"), fill=c(ugcolor, intcolor, qcallcolor)); - -if (!is.na(plotRoot)) { dev.off(); } - -# Barplot (normalized) -if (!is.na(plotRoot)) { - pdf(paste(plotRoot, ".venn_per_ac.norm.pdf", sep="")); -} else { - x11(); -} - -barplot(matnorm, col=c(qcallcolor, intcolor, ugcolor), xlab="Allele count", ylab="Fraction", main=paste("Callset concordance per allele count (", population, ")", sep=""), names.arg=dataMetricsByAc.UG.all$AC, ylim=c(0, 1.3)); -legend("topright", c("UG-only", "Intersection", "QCALL-only"), fill=c(ugcolor, intcolor, qcallcolor)); - -if (!is.na(plotRoot)) { dev.off(); } - -# Per-sample -if (!is.na(plotRoot)) { - pdf(paste(plotRoot, ".counts_per_sample.pdf", sep=""), width=10); -} else { - x11(width=10); -} - -dataPerSample = read.table(filePerSample, header=TRUE); - -par(mar=c(7, 5, 2, 1)); - -plot(0, 0, type="n", xaxt="n", xlim=c(1, length(dataPerSample$sample)), ylim=c(0, max(dataPerSample$Intersection + dataPerSample$UG, dataPerSample$Intersection, dataPerSample$Intersection + dataPerSample$QCALL)), xlab="", ylab="Counts", main=paste("SNPs per sample (", population, ")", sep=""), cex=1.2, cex.lab=1.2, cex.axis=1.2); - -points(dataPerSample$sample, dataPerSample$Intersection + dataPerSample$UG, col=ugcolor); -points(dataPerSample$sample, dataPerSample$Intersection, col=intcolor); -points(dataPerSample$sample, dataPerSample$Intersection + dataPerSample$QCALL, col=qcallcolor); - -legend("bottomleft", c("UG", "Intersection", "QCALL"), fill=c(ugcolor, intcolor, qcallcolor)); - -axis(1, at=c(1:length(dataPerSample$sample)), labels=dataPerSample$sample, las=2, cex.axis=0.7); - -if (!is.na(plotRoot)) { dev.off(); } diff --git a/perl/reports/1KG/MakeTopSheetSlide.pl b/perl/reports/1KG/MakeTopSheetSlide.pl deleted file mode 100755 index 6fad34610..000000000 --- a/perl/reports/1KG/MakeTopSheetSlide.pl +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use FindBin; -use lib "$FindBin::Bin/../../"; - -use StingArgs; -use DistributedMake; -use DataTable; -use Data::Dumper; - -my %args = &getCommandArguments( - "VCF" => undef, - "EVAL_ROOT" => undef, - "OUT" => undef, - - "POP" => undef, - "SRP_IDS" => undef, - "SEQUENCE_INDEX" => "/broad/1KG/DCC/ftp/sequence.index", - "ALIGNMENT_INDEX" => "/broad/1KG/DCC/ftp/alignment.index" -); - -my @sequenceIndex = &readTable($args{'SEQUENCE_INDEX'}, "header" => 1, "delimiter" => "\t"); -my @alignmentIndex = &readTable($args{'ALIGNMENT_INDEX'}, "header" => 1, "delimiter" => "\t"); - -my %results; - -foreach my $sequenceEntry (@sequenceIndex) { - my %sequenceEntry = %{$sequenceEntry}; - - if ($sequenceEntry{'STUDY_NAME'} =~ /1000/ && $sequenceEntry{'ANALYSIS_GROUP'} =~ /low coverage/ && $sequenceEntry{'FASTQ_FILE'} !~ /_\d.filt.fastq.gz/) { - ${${$results{'all'}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++; - ${${$results{$sequenceEntry{'STUDY_ID'}}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++; - - ${$results{'all'}}{'population'} = "all"; - ${$results{$sequenceEntry{'STUDY_ID'}}}{'population'} = $sequenceEntry{'POPULATION'}; - - ${$results{'all'}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0); - ${$results{$sequenceEntry{'STUDY_ID'}}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0); - - ${$results{'all'}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0); - ${$results{$sequenceEntry{'STUDY_ID'}}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0); - - if ($sequenceEntry{'WITHDRAWN'} == 0) { - ${${${$results{'all'}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++; - ${${${$results{$sequenceEntry{'STUDY_ID'}}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++; - } else { - ${${$results{'all'}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++; - ${${$results{$sequenceEntry{'STUDY_ID'}}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++; - } - - ${${$results{'all'}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++; - ${${$results{$sequenceEntry{'STUDY_ID'}}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++; - - ${${$results{'all'}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++; - ${${$results{$sequenceEntry{'STUDY_ID'}}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++; - - ${${$results{'all'}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++; - ${${$results{$sequenceEntry{'STUDY_ID'}}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++; - } -} - -foreach my $studyid (keys(%results)) { - my %study = %{${${$results{$studyid}}{'lanes_per_samples'}}}; - my @samples = keys(%study); - my $numSamples = scalar(@samples); - - ${$results{$studyid}}{'num_samples'} = $numSamples; -} - -print Dumper(\%results); diff --git a/perl/reports/1KG/MakeVennSlide.pl b/perl/reports/1KG/MakeVennSlide.pl deleted file mode 100755 index 4357f3991..000000000 --- a/perl/reports/1KG/MakeVennSlide.pl +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use FindBin; -use lib "$FindBin::Bin/../../"; - -use StingArgs; -use DistributedMake; -use Data::Dumper; -use File::Basename; - -sub getCSV { - my ($file) = @_; - - my @header; - my @csv; - - open(CSV, $file); - while (my $csvline = ) { - if ($csvline !~ /#/) { - chomp(my @columns = split(/,/, $csvline)); - - if (scalar(@columns) > 1) { - if (scalar(@header) < 2) { - @header = @columns; - } else { - my %entry; - for (my $column = 0; $column <= $#header; $column++) { - $entry{$header[$column]} = $columns[$column]; - } - - push(@csv, \%entry); - } - } - } - } - close(CSV); - - return @csv; -} - -my %args = &getCommandArguments("EVAL_ROOT" => undef, "OUT_ROOT" => undef, "POP" => undef); - -my %values; - -my @header; -my @counts = &getCSV("$args{'EVAL_ROOT'}.Count_Variants.csv"); -my @titv = &getCSV("$args{'EVAL_ROOT'}.Ti_slash_Tv_Variant_Evaluator.csv"); - -my $varout = "$args{'OUT_ROOT'}.kvp"; -open(VAROUT, ">$varout"); -foreach my $entry (@counts) { - my %entry = %$entry; - - print VAROUT "$entry{'jexl_expression'}.$entry{'filter_name'}.$entry{'novelty_name'}.nVariantLoci\t$entry{'nVariantLoci'}\n"; - - if ($entry{'jexl_expression'} eq 'Intersection') { $values{'Intersection'} = $entry{'nVariantLoci'}; } - if ($entry{'jexl_expression'} eq 'UG') { $values{'UG-only'} = $entry{'nVariantLoci'}; } - if ($entry{'jexl_expression'} eq 'QCALL') { $values{'QCALL-only'} = $entry{'nVariantLoci'}; } -} - -foreach my $entry (@titv) { - my %entry = %$entry; - - print VAROUT "$entry{'jexl_expression'}.$entry{'filter_name'}.$entry{'novelty_name'}.ti/tv_ratio\t$entry{'ti/tv ratio'}\n"; -} - -my $outpng = "$args{'OUT_ROOT'}.png"; -print VAROUT "venndiagram\t$args{'OUT_ROOT'}\n"; -print VAROUT "population\t$args{'POP'}\n"; -print VAROUT "timestamp\t" . gmtime() . "\n"; -close(VAROUT); - -my $ug = $values{'UG-only'} + $values{'Intersection'}; -my $qcall = $values{'QCALL-only'} + $values{'Intersection'}; - -my $scaledug = ($ug > $qcall) ? 100 : int($ug*100/$qcall); -my $scaledqcall = ($qcall > $ug) ? 100 : int($qcall*100/$ug); -my $scaledint = ($ug > $qcall) ? int($values{'Intersection'}*100/$ug) : int($values{'Intersection'}*100/$qcall); - -my $dm = new DistributedMake("dryRun" => 0); - -my $vennCmd = "wget -O $outpng \"http://chart.apis.google.com/chart?cht=v&chd=t:$scaledqcall,$scaledug,0,$scaledint&chco=FF3355,00BBFF&chs=600x480&chdl=QCALL|UnifiedGenotyper&chtt=Callset+concordance+($args{'POP'})&chts=000000,20&chma=100,0,0,0\""; -$dm->addRule($outpng, $varout, $vennCmd); - -my $render = "$args{'OUT_ROOT'}.tex"; -my $renderCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/RenderTemplate.pl KVP=$varout TEMPLATE=$ENV{'STING_DIR'}/perl/reports/1KG/venn_template.tex TEMPLATE_OUT=$render"; -$dm->addRule($render, $outpng, $renderCmd); - -my $pdf = "$args{'OUT_ROOT'}.pdf"; -my $pdfdir = dirname($args{'OUT_ROOT'}); -my $pdfCmd = "pdflatex -output-directory $pdfdir $render"; -$dm->addRule($pdf, $render, $pdfCmd); - -$dm->execute(); diff --git a/perl/reports/1KG/RenderTemplate.pl b/perl/reports/1KG/RenderTemplate.pl deleted file mode 100755 index 8b980b247..000000000 --- a/perl/reports/1KG/RenderTemplate.pl +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use FindBin; -use lib "$FindBin::Bin/../../"; - -use StingArgs; -use Data::Dumper; - -my %args = &getCommandArguments("KVP" => undef, "TEMPLATE" => undef, "TEMPLATE_OUT" => undef); - -my %vars; -open(KVP, $args{'KVP'}); -while (my $kvpline = ) { - chomp($kvpline); - - my ($key, $value) = split(/\t+/, $kvpline); - $vars{$key} = $value; -} -close(KVP); - -open(TEMPLATE, $args{'TEMPLATE'}); -open(TEMPLATE_OUT, ">$args{'TEMPLATE_OUT'}"); -while (my $line =