From 3f3ef608e5e5a1d791965afdc34770e4846cd41d Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Mon, 24 Jul 2017 00:57:50 +0200 Subject: [PATCH] Archive GATK3-specific docs from the forum --- .../(howto)_Map_and_mark_duplicates.md | 63 ++ ...Perform_local_realignment_around_indels.md | 44 ++ ...e_a_reference_for_use_with_BWA_and_GATK.md | 45 ++ ...tions_Using_SnpEff_and_VariantAnnotator.md | 177 +++++ .../deprecated/BWA_C_Bindings_-_RETIRED.md | 336 ++++++++ .../Data_Processing_Pipeline_-_RETIRED.md | 158 ++++ ...or_VCF_files_not_being_ordered_properly.md | 16 + .../deprecated/Genotype_and_Validate.md | 76 ++ .../How_to_get_and_install_Firepony.md | 26 + doc_archive/deprecated/How_to_use_Firepony.md | 46 ++ .../Merging_batched_call_sets_-_RETIRED.md | 117 +++ ...ads_corresponding_to_a_genomic_interval.md | 5 + ...p_and_clean_up_short_read_sequence_data.md | 5 + ..._unmapped_BAM_from_FASTQ_or_aligned_BAM.md | 5 + ...plicates_or_MarkDuplicatesWithMateCigar.md | 5 + ...How_to)_Visualize_an_alignment_with_IGV.md | 5 + ...e_alignment_qualities_(BAQ)_in_the_GATK.md | 50 ++ .../Statistical_methods_used_by_GATK_tools.md | 90 +++ .../deprecated/Using_Variant_Annotator.md | 30 + ...Oct_2013_GATK_workshop_hands-on_session.md | 54 ++ ...Firepony_and_what_can_I_expect_from_it?.md | 16 + ...roughput_sequencing_concepts_and_terms?.md | 25 + .../Workshop_walkthrough_(Brussels_2014).md | 79 ++ ...rate_a_BAM_for_variant_discovery_(long).md | 476 +++++++++++ ...to)_Set_up_remote_debugging_in_IntelliJ.md | 28 + .../(howto)_Speed_up_GATK_compilation.md | 31 + ...:_AlignmentContext_and_ReadBackedPileup.md | 49 ++ ...ing_and_updating_dependencies_[RETIRED].md | 45 ++ .../developer-zone/Collecting_output.md | 34 + .../developer-zone/Documenting_walkers.md | 32 + .../Frequently_asked_questions_about_Scala.md | 78 ++ ...evelopment_process_and_coding_standards.md | 165 ++++ ...ing_samtools-jdk,_tribble,_and_variant).md | 13 + .../How_to_include_GATK_in_a_Maven_project.md | 41 + ..._walker_compatible_with_multi-threading.md | 36 + .../developer-zone/Managing_user_inputs.md | 289 +++++++ ...lker_data_presentation_and_flow_control.md | 102 +++ ...gration_from_Apache_Ant_to_Apache_Maven.md | 174 +++++ .../Notes_on_downsampling_in_HC_M2.md | 40 + .../developer-zone/Output_management.md | 113 +++ doc_archive/developer-zone/Scala_resources.md | 32 + ...deletion_spanning_reads_in_LocusWalkers.md | 48 ++ ...ronment:_Maven_and_IntelliJ_for_GATK_3+.md | 85 ++ .../developer-zone/Sting_to_GATK_renaming.md | 736 ++++++++++++++++++ doc_archive/developer-zone/Tribble.md | 119 +++ ...fferences_between_structured_data_files.md | 102 +++ .../Writing_GATKdocs_for_your_walkers.md | 56 ++ ...working_with_reference_metadata_classes.md | 60 ++ .../Writing_unit_tests_for_walkers.md | 133 ++++ doc_archive/developer-zone/Writing_walkers.md | 68 ++ .../Writing_walkers_in_Scala.md | 55 ++ doc_archive/dictionary/Bait_bias.md | 6 + .../Biallelic_vs_Multiallelic_sites.md | 19 + ...lfite_sequencing___Cytosine_methylation.md | 6 + doc_archive/dictionary/Downsampling.md | 44 ++ doc_archive/dictionary/Heterozygosity.md | 9 + doc_archive/dictionary/Hybrid_selection.md | 8 + doc_archive/dictionary/Jumping_libraries.md | 5 + .../Likelihoods_and_Probabilities.md | 16 + .../dictionary/Mate_unmapped_records.md | 19 + .../dictionary/OxoG_oxidative_artifacts.md | 12 + .../PF_reads___Illumina_chastity_filter.md | 11 + .../dictionary/Paired-end___mate-pair.md | 18 + doc_archive/dictionary/Parallelism.md | 86 ++ .../dictionary/Pedigree___PED_files.md 
| 37 + .../dictionary/Phred-scaled_Quality_Scores.md | 69 ++ ...adapter_artifacts_(in_hybrid_selection).md | 6 + doc_archive/dictionary/Read_groups.md | 65 ++ .../dictionary/Reference_Genome_Components.md | 79 ++ .../Spanning_or_overlapping_deletions.md | 15 + ..._to_the_same_sample_into_a_single_file?.md | 12 + ...int_calling_workflow_to_my_RNAseq_data?.md | 8 + ...an_I_use_GATK_on_non-diploid_organisms?.md | 19 + ...GATK_at_different_steps_of_my_analysis?.md | 18 + .../faqs/Collected_FAQs_about_VCF_files.md | 10 + ...files_for_sequence_read_data_(BAM_CRAM).md | 90 +++ .../Collected_FAQs_about_interval_lists.md | 40 + ...can_I_access_the_GSA_public_FTP_server?.md | 18 + ...nvoke_read_filters_and_their_arguments?.md | 14 + ...epare_a_FASTA_file_to_use_as_reference?.md | 114 +++ ...rn_on_or_customize_forum_notifications?.md | 16 + ...allelism_to_make_GATK_tools_run_faster?.md | 164 ++++ .../How_do_I_submit_a_detailed_bug_report?.md | 36 + ...he_GATK_handle_these_huge_NGS_datasets?.md | 9 + ...uld_I_cite_GATK_in_my_own_publications?.md | 25 + ...d_sequencing_and_multi-library_designs?.md | 53 ++ ..._Panel_of_Normals_for_somatic_analysis?.md | 11 + .../I'm_new_to_GATK._Where_do_I_start?.md | 45 ++ ...o_they_mean_and_why_are_they_important?.md | 18 + ...I_analyze_my_samples_alone_or_together?.md | 31 + ...typeCaller_to_call_variants_on_my_data?.md | 14 + ...e_resource_bundle_and_how_can_I_get_it?.md | 49 ++ ...are_the_prerequisites_for_running_GATK?.md | 12 + ..._attending_a_workshop_hands-on_session?.md | 11 + .../What_do_the_VariantEval_modules_do?.md | 263 +++++++ ...t_files_does_the_GATK_accept___require?.md | 66 ++ ...hone_Home\"_and_how_does_it_affect_me?.md" | 108 +++ ...relate_to_\"full\"_GATK_2.x?_[RETIRED].md" | 34 + ...why_are_GATK_tools_called_\"walkers\"?.md" | 28 + ...w_is_it_different_from_a_'regular'_VCF?.md | 90 +++ ...is_a_VCF_and_how_should_I_interpret_it?.md | 175 +++++ .../What_is_the_GATKReport_file_format?.md | 63 ++ ...erence_between_QUAL_and_GQ_annotations?.md | 16 + ...hat_is_the_structure_of_a_GATK_command?.md | 35 + ...STQ_for_storing_unmapped_sequence_data?.md | 7 + ...nown_variants_sites_for_running_tool_X?.md | 110 +++ ...ariants_can_GATK_tools_detect___handle?.md | 19 + ..._use_-L_to_pass_in_a_list_of_intervals?.md | 74 ++ ...can_I_get_a_gene_list_in_RefSeq_format?.md | 32 + .../Where_can_I_get_the_GATK_source_code?.md | 22 + ...for_reviewing_or_benchmarking_purposes?.md | 43 + .../Which_tools_use_pedigree_information?.md | 13 + ...rguments_should_I_use_for_running_VQSR?.md | 136 ++++ ..._VariantAnnotator_compared_to_UG_or_HC?.md | 13 + ...Base_Quality_Score_Recalibration_(BQSR).md | 230 ++++++ ...actices_for_Variant_Discovery_in_DNAseq.md | 38 + ...actices_for_Variant_Discovery_in_RNAseq.md | 41 + .../methods/Calling_variants_in_RNAseq.md | 80 ++ ..._using_the_HaplotypeCaller_in_GVCF_mode.md | 29 + ..._variants_from_different_files_into_one.md | 74 ++ ...uating_the_quality_of_a_variant_callset.md | 109 +++ .../methods/Genotype_Refinement_workflow.md | 76 ++ ...finement_workflow:_mathematical_details.md | 30 + ...overview:_How_the_HaplotypeCaller_works.md | 39 + ...ActiveRegions_by_measuring_data_entropy.md | 54 ++ ...re-assembly_and_haplotype_determination.md | 35 + ...ence_for_haplotypes_and_variant_alleles.md | 18 + ..._step_4:_Assigning_per-sample_genotypes.md | 51 ++ ...ller's_reference_confidence_model_works.md | 15 + ...on_to_the_GATK_Best_Practices_workflows.md | 20 + .../Local_Realignment_around_Indels.md | 40 + 
...How_PL_is_calculated_in_HaplotypeCaller.md | 80 ++ ...ding_the_QUAL_score_and_its_limitations.md | 68 ++ .../Performing_sequence_coverage_analysis.md | 76 ++ ...se_and_operation_of_Read-backed_Phasing.md | 55 ++ ...ation:_PairedEndSingleSampleWf_pipeline.md | 729 +++++++++++++++++ ...ing_variants_of_interest_from_a_callset.md | 50 ++ ...tistical_methods:_Fisher’s_Exact_Test.md | 209 +++++ ...istical_methods:_Inbreeding_Coefficient.md | 45 ++ .../Statistical_methods:_Rank_Sum_Test.md | 57 ++ ..._generic_hard-filtering_recommendations.md | 75 ++ ...ect_variants_based_on_annotation_values.md | 73 ++ ...coverage_metrics_for_variant_evaluation.md | 24 + ...VariantEval_Evaluation_Modules_Glossary.md | 158 ++++ ...iant_Quality_Score_Recalibration_(VQSR).md | 68 ++ ...llele_Depth_(AD)_is_lower_than_expected.md | 66 ++ ...rror_message_\"RScript_exited_with_1\".md" | 36 + ...es_not_being_properly_ordered_or_sorted.md | 27 + ..._having_missing_or_incompatible_contigs.md | 65 ++ .../Errors_about_misencoded_quality_scores.md | 14 + ...rrors_about_read_group_(RG)_information.md | 37 + ...s_can_be_diagnosed_with_ValidateSamFile.md | 166 ++++ ...VQSR_(recalibration)_to_filter_variants.md | 55 ++ ...get_the_annotations_I_specified_with_-A.md | 27 + ...cific_site,_but_it's_not_getting_called.md | 32 + ...that_require_different_versions_of_Java.md | 11 + ...equently_asked_questions_about_QScripts.md | 95 +++ doc_archive/queue/Overview_of_Queue.md | 94 +++ .../queue/Pipelining_the_GATK_with_Queue.md | 188 +++++ .../QFunction_and_Command_Line_Options.md | 243 ++++++ .../queue/Queue_CommandLineFunctions.md | 133 ++++ .../queue/Queue_custom_job_schedulers.md | 77 ++ .../Queue_pipeline_scripts_(QScripts).md | 335 ++++++++ doc_archive/queue/Queue_with_Grid_Engine.md | 45 ++ doc_archive/queue/Queue_with_IntelliJ_IDEA.md | 170 ++++ doc_archive/queue/The_10+_Queuemandents.md | 20 + ...ng_unit___regression_tests_for_QScripts.md | 137 ++++ ...ads_corresponding_to_a_genomic_interval.md | 46 ++ .../(How_to)_Fix_a_badly_formatted_BAM.md | 92 +++ ..._unmapped_BAM_from_FASTQ_or_aligned_BAM.md | 125 +++ ...up_short_read_sequence_data_efficiently.md | 295 +++++++ ...ence_with_alternate_contigs_like_GRCh38.md | 273 +++++++ ...plicates_or_MarkDuplicatesWithMateCigar.md | 158 ++++ ...ads_using_a_reference_genome_ALT_contig.md | 87 +++ ...howto)_Apply_hard_filters_to_a_call_set.md | 107 +++ ...wto)_Call_variants_with_HaplotypeCaller.md | 50 ++ ...Call_variants_with_the_UnifiedGenotyper.md | 51 ++ ...ts_with_GATK_-_A_GATK_Workshop_Tutorial.md | 262 +++++++ ...llset_with_CollectVariantCallingMetrics.md | 47 ++ ...to)_Evaluate_a_callset_with_VariantEval.md | 66 ++ ...typeCaller_has_remapped_sequence_reads.md" | 28 + ...ired_to_follow_the_GATK_Best_Practices..md | 136 ++++ ...to)_Install_software_for_GATK_workshops.md | 130 ++++ ...Perform_local_realignment_around_indels.md | 155 ++++ ...alibrate_base_quality_scores_=_run_BQSR.md | 75 ++ ...brate_variant_quality_scores_=_run_VQSR.md | 252 ++++++ ...owto)_Revert_a_BAM_file_to_FastQ_format.md | 47 ++ .../(howto)_Run_Queue_for_the_first_time.md | 90 +++ ...(howto)_Run_the_GATK_for_the_first_time.md | 165 ++++ ...o)_Run_the_genotype_refinement_workflow.md | 22 + .../(howto)_Test_your_GATK_installation.md | 71 ++ .../(howto)_Test_your_Queue_installation.md | 100 +++ ...(howto)_Visualize_an_alignment_with_IGV.md | 61 ++ ..._to_(howto)_Discover_variants_with_GATK.md | 67 ++ .../Tutorial_files_provenance:_ASHG15.md | 98 +++ 195 files changed, 15402 insertions(+) create mode 100644 
doc_archive/deprecated/(howto)_Map_and_mark_duplicates.md create mode 100644 doc_archive/deprecated/(howto)_Perform_local_realignment_around_indels.md create mode 100644 doc_archive/deprecated/(howto)_Prepare_a_reference_for_use_with_BWA_and_GATK.md create mode 100644 doc_archive/deprecated/Adding_Genomic_Annotations_Using_SnpEff_and_VariantAnnotator.md create mode 100644 doc_archive/deprecated/BWA_C_Bindings_-_RETIRED.md create mode 100644 doc_archive/deprecated/Data_Processing_Pipeline_-_RETIRED.md create mode 100644 doc_archive/deprecated/Errors_about_BAM_or_VCF_files_not_being_ordered_properly.md create mode 100644 doc_archive/deprecated/Genotype_and_Validate.md create mode 100644 doc_archive/deprecated/How_to_get_and_install_Firepony.md create mode 100644 doc_archive/deprecated/How_to_use_Firepony.md create mode 100644 doc_archive/deprecated/Merging_batched_call_sets_-_RETIRED.md create mode 100644 doc_archive/deprecated/Moved:_(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md create mode 100644 doc_archive/deprecated/Moved:_(How_to)_Efficiently_map_and_clean_up_short_read_sequence_data.md create mode 100644 doc_archive/deprecated/Moved:_(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md create mode 100644 doc_archive/deprecated/Moved:_(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md create mode 100644 doc_archive/deprecated/Moved:_(How_to)_Visualize_an_alignment_with_IGV.md create mode 100644 doc_archive/deprecated/Per-base_alignment_qualities_(BAQ)_in_the_GATK.md create mode 100644 doc_archive/deprecated/Statistical_methods_used_by_GATK_tools.md create mode 100644 doc_archive/deprecated/Using_Variant_Annotator.md create mode 100644 doc_archive/deprecated/Walkthrough_of_the_Oct_2013_GATK_workshop_hands-on_session.md create mode 100644 doc_archive/deprecated/What_is_Firepony_and_what_can_I_expect_from_it?.md create mode 100644 doc_archive/deprecated/Where_can_I_get_more_information_about_high-throughput_sequencing_concepts_and_terms?.md create mode 100644 doc_archive/deprecated/Workshop_walkthrough_(Brussels_2014).md create mode 100644 doc_archive/deprecated/[How_to]_Generate_a_BAM_for_variant_discovery_(long).md create mode 100644 doc_archive/developer-zone/(howto)_Set_up_remote_debugging_in_IntelliJ.md create mode 100644 doc_archive/developer-zone/(howto)_Speed_up_GATK_compilation.md create mode 100644 doc_archive/developer-zone/Accessing_reads:_AlignmentContext_and_ReadBackedPileup.md create mode 100644 doc_archive/developer-zone/Adding_and_updating_dependencies_[RETIRED].md create mode 100644 doc_archive/developer-zone/Collecting_output.md create mode 100644 doc_archive/developer-zone/Documenting_walkers.md create mode 100644 doc_archive/developer-zone/Frequently_asked_questions_about_Scala.md create mode 100644 doc_archive/developer-zone/GATK_development_process_and_coding_standards.md create mode 100644 doc_archive/developer-zone/How_to_access_the_picard_and_htsjdk_repository_(containing_samtools-jdk,_tribble,_and_variant).md create mode 100644 doc_archive/developer-zone/How_to_include_GATK_in_a_Maven_project.md create mode 100644 doc_archive/developer-zone/How_to_make_a_walker_compatible_with_multi-threading.md create mode 100644 doc_archive/developer-zone/Managing_user_inputs.md create mode 100644 doc_archive/developer-zone/Managing_walker_data_presentation_and_flow_control.md create mode 100644 doc_archive/developer-zone/Migration_from_Apache_Ant_to_Apache_Maven.md create mode 100644 
doc_archive/developer-zone/Notes_on_downsampling_in_HC_M2.md create mode 100644 doc_archive/developer-zone/Output_management.md create mode 100644 doc_archive/developer-zone/Scala_resources.md create mode 100644 doc_archive/developer-zone/Seeing_deletion_spanning_reads_in_LocusWalkers.md create mode 100644 doc_archive/developer-zone/Setting_up_your_dev_environment:_Maven_and_IntelliJ_for_GATK_3+.md create mode 100644 doc_archive/developer-zone/Sting_to_GATK_renaming.md create mode 100644 doc_archive/developer-zone/Tribble.md create mode 100644 doc_archive/developer-zone/Using_DiffEngine_to_summarize_differences_between_structured_data_files.md create mode 100644 doc_archive/developer-zone/Writing_GATKdocs_for_your_walkers.md create mode 100644 doc_archive/developer-zone/Writing_and_working_with_reference_metadata_classes.md create mode 100644 doc_archive/developer-zone/Writing_unit_tests_for_walkers.md create mode 100644 doc_archive/developer-zone/Writing_walkers.md create mode 100644 doc_archive/developer-zone/Writing_walkers_in_Scala.md create mode 100644 doc_archive/dictionary/Bait_bias.md create mode 100644 doc_archive/dictionary/Biallelic_vs_Multiallelic_sites.md create mode 100644 doc_archive/dictionary/Bisulfite_sequencing___Cytosine_methylation.md create mode 100644 doc_archive/dictionary/Downsampling.md create mode 100644 doc_archive/dictionary/Heterozygosity.md create mode 100644 doc_archive/dictionary/Hybrid_selection.md create mode 100644 doc_archive/dictionary/Jumping_libraries.md create mode 100644 doc_archive/dictionary/Likelihoods_and_Probabilities.md create mode 100644 doc_archive/dictionary/Mate_unmapped_records.md create mode 100644 doc_archive/dictionary/OxoG_oxidative_artifacts.md create mode 100644 doc_archive/dictionary/PF_reads___Illumina_chastity_filter.md create mode 100644 doc_archive/dictionary/Paired-end___mate-pair.md create mode 100644 doc_archive/dictionary/Parallelism.md create mode 100644 doc_archive/dictionary/Pedigree___PED_files.md create mode 100644 doc_archive/dictionary/Phred-scaled_Quality_Scores.md create mode 100644 doc_archive/dictionary/Pre-adapter_artifacts_(in_hybrid_selection).md create mode 100644 doc_archive/dictionary/Read_groups.md create mode 100644 doc_archive/dictionary/Reference_Genome_Components.md create mode 100644 doc_archive/dictionary/Spanning_or_overlapping_deletions.md create mode 100644 doc_archive/faqs/At_what_point_should_I_merge_read_group_BAM_files_belonging_to_the_same_sample_into_a_single_file?.md create mode 100644 doc_archive/faqs/Can_I_apply_the_germline_variant_joint_calling_workflow_to_my_RNAseq_data?.md create mode 100644 doc_archive/faqs/Can_I_use_GATK_on_non-diploid_organisms?.md create mode 100644 doc_archive/faqs/Can_I_use_different_versions_of_the_GATK_at_different_steps_of_my_analysis?.md create mode 100644 doc_archive/faqs/Collected_FAQs_about_VCF_files.md create mode 100644 doc_archive/faqs/Collected_FAQs_about_input_files_for_sequence_read_data_(BAM_CRAM).md create mode 100644 doc_archive/faqs/Collected_FAQs_about_interval_lists.md create mode 100644 doc_archive/faqs/How_can_I_access_the_GSA_public_FTP_server?.md create mode 100644 doc_archive/faqs/How_can_I_invoke_read_filters_and_their_arguments?.md create mode 100644 doc_archive/faqs/How_can_I_prepare_a_FASTA_file_to_use_as_reference?.md create mode 100644 doc_archive/faqs/How_can_I_turn_on_or_customize_forum_notifications?.md create mode 100644 doc_archive/faqs/How_can_I_use_parallelism_to_make_GATK_tools_run_faster?.md create mode 100644 
doc_archive/faqs/How_do_I_submit_a_detailed_bug_report?.md create mode 100644 doc_archive/faqs/How_does_the_GATK_handle_these_huge_NGS_datasets?.md create mode 100644 doc_archive/faqs/How_should_I_cite_GATK_in_my_own_publications?.md create mode 100644 doc_archive/faqs/How_should_I_pre-process_data_from_multiplexed_sequencing_and_multi-library_designs?.md create mode 100644 doc_archive/faqs/How_should_I_select_samples_for_a_Panel_of_Normals_for_somatic_analysis?.md create mode 100644 doc_archive/faqs/I'm_new_to_GATK._Where_do_I_start?.md create mode 100644 doc_archive/faqs/Lane,_Library,_Sample_and_Cohort_--_what_do_they_mean_and_why_are_they_important?.md create mode 100644 doc_archive/faqs/Should_I_analyze_my_samples_alone_or_together?.md create mode 100644 doc_archive/faqs/Should_I_use_UnifiedGenotyper_or_HaplotypeCaller_to_call_variants_on_my_data?.md create mode 100644 doc_archive/faqs/What's_in_the_resource_bundle_and_how_can_I_get_it?.md create mode 100644 doc_archive/faqs/What_are_the_prerequisites_for_running_GATK?.md create mode 100644 doc_archive/faqs/What_do_I_need_to_do_before_attending_a_workshop_hands-on_session?.md create mode 100644 doc_archive/faqs/What_do_the_VariantEval_modules_do?.md create mode 100644 doc_archive/faqs/What_input_files_does_the_GATK_accept___require?.md create mode 100644 "doc_archive/faqs/What_is_\"Phone_Home\"_and_how_does_it_affect_me?.md" create mode 100644 "doc_archive/faqs/What_is_GATK-Lite_and_how_does_it_relate_to_\"full\"_GATK_2.x?_[RETIRED].md" create mode 100644 "doc_archive/faqs/What_is_Map_Reduce_and_why_are_GATK_tools_called_\"walkers\"?.md" create mode 100644 doc_archive/faqs/What_is_a_GVCF_and_how_is_it_different_from_a_'regular'_VCF?.md create mode 100644 doc_archive/faqs/What_is_a_VCF_and_how_should_I_interpret_it?.md create mode 100644 doc_archive/faqs/What_is_the_GATKReport_file_format?.md create mode 100644 doc_archive/faqs/What_is_the_difference_between_QUAL_and_GQ_annotations?.md create mode 100644 doc_archive/faqs/What_is_the_structure_of_a_GATK_command?.md create mode 100644 doc_archive/faqs/What_is_uBAM_and_why_is_it_better_than_FASTQ_for_storing_unmapped_sequence_data?.md create mode 100644 doc_archive/faqs/What_should_I_use_as_known_variants_sites_for_running_tool_X?.md create mode 100644 doc_archive/faqs/What_types_of_variants_can_GATK_tools_detect___handle?.md create mode 100644 doc_archive/faqs/When_should_I_use_-L_to_pass_in_a_list_of_intervals?.md create mode 100644 doc_archive/faqs/Where_can_I_get_a_gene_list_in_RefSeq_format?.md create mode 100644 doc_archive/faqs/Where_can_I_get_the_GATK_source_code?.md create mode 100644 doc_archive/faqs/Which_datasets_should_I_use_for_reviewing_or_benchmarking_purposes?.md create mode 100644 doc_archive/faqs/Which_tools_use_pedigree_information?.md create mode 100644 doc_archive/faqs/Which_training_sets___arguments_should_I_use_for_running_VQSR?.md create mode 100644 doc_archive/faqs/Why_are_some_of_the_annotation_values_different_with_VariantAnnotator_compared_to_UG_or_HC?.md create mode 100644 doc_archive/methods/Base_Quality_Score_Recalibration_(BQSR).md create mode 100644 doc_archive/methods/Best_Practices_for_Variant_Discovery_in_DNAseq.md create mode 100644 doc_archive/methods/Best_Practices_for_Variant_Discovery_in_RNAseq.md create mode 100644 doc_archive/methods/Calling_variants_in_RNAseq.md create mode 100644 doc_archive/methods/Calling_variants_on_cohorts_of_samples_using_the_HaplotypeCaller_in_GVCF_mode.md create mode 100644 
doc_archive/methods/Combining_variants_from_different_files_into_one.md create mode 100644 doc_archive/methods/Evaluating_the_quality_of_a_variant_callset.md create mode 100644 doc_archive/methods/Genotype_Refinement_workflow.md create mode 100644 doc_archive/methods/Genotype_Refinement_workflow:_mathematical_details.md create mode 100644 doc_archive/methods/HC_overview:_How_the_HaplotypeCaller_works.md create mode 100644 doc_archive/methods/HC_step_1:_Defining_ActiveRegions_by_measuring_data_entropy.md create mode 100644 doc_archive/methods/HC_step_2:_Local_re-assembly_and_haplotype_determination.md create mode 100644 doc_archive/methods/HC_step_3_:_Evaluating_the_evidence_for_haplotypes_and_variant_alleles.md create mode 100644 doc_archive/methods/HC_step_4:_Assigning_per-sample_genotypes.md create mode 100644 doc_archive/methods/How_the_HaplotypeCaller's_reference_confidence_model_works.md create mode 100644 doc_archive/methods/Introduction_to_the_GATK_Best_Practices_workflows.md create mode 100644 doc_archive/methods/Local_Realignment_around_Indels.md create mode 100644 doc_archive/methods/Math_notes:_How_PL_is_calculated_in_HaplotypeCaller.md create mode 100644 doc_archive/methods/Math_notes:_Understanding_the_QUAL_score_and_its_limitations.md create mode 100644 doc_archive/methods/Performing_sequence_coverage_analysis.md create mode 100644 doc_archive/methods/Purpose_and_operation_of_Read-backed_Phasing.md create mode 100644 doc_archive/methods/Reference_implementation:_PairedEndSingleSampleWf_pipeline.md create mode 100644 doc_archive/methods/Selecting_variants_of_interest_from_a_callset.md create mode 100644 doc_archive/methods/Statistical_methods:_Fisher’s_Exact_Test.md create mode 100644 doc_archive/methods/Statistical_methods:_Inbreeding_Coefficient.md create mode 100644 doc_archive/methods/Statistical_methods:_Rank_Sum_Test.md create mode 100644 doc_archive/methods/Understanding_and_adapting_the_generic_hard-filtering_recommendations.md create mode 100644 doc_archive/methods/Using_JEXL_to_apply_hard_filters_or_select_variants_based_on_annotation_values.md create mode 100644 doc_archive/methods/Using_depth_of_coverage_metrics_for_variant_evaluation.md create mode 100644 doc_archive/methods/VariantEval_Evaluation_Modules_Glossary.md create mode 100644 doc_archive/methods/Variant_Quality_Score_Recalibration_(VQSR).md create mode 100644 doc_archive/problems/Allele_Depth_(AD)_is_lower_than_expected.md create mode 100644 "doc_archive/problems/AnalyzeCovariates_fails_with_error_message_\"RScript_exited_with_1\".md" create mode 100644 doc_archive/problems/Errors_about_contigs_in_BAM_or_VCF_files_not_being_properly_ordered_or_sorted.md create mode 100644 doc_archive/problems/Errors_about_input_files_having_missing_or_incompatible_contigs.md create mode 100644 doc_archive/problems/Errors_about_misencoded_quality_scores.md create mode 100644 doc_archive/problems/Errors_about_read_group_(RG)_information.md create mode 100644 doc_archive/problems/Errors_in_SAM_BAM_files_can_be_diagnosed_with_ValidateSamFile.md create mode 100644 doc_archive/problems/I_am_unable_to_use_VQSR_(recalibration)_to_filter_variants.md create mode 100644 doc_archive/problems/I_do_not_get_the_annotations_I_specified_with_-A.md create mode 100644 doc_archive/problems/I_expect_to_see_a_variant_at_a_specific_site,_but_it's_not_getting_called.md create mode 100644 doc_archive/problems/I_need_to_run_programs_that_require_different_versions_of_Java.md create mode 100644 
doc_archive/queue/Frequently_asked_questions_about_QScripts.md create mode 100644 doc_archive/queue/Overview_of_Queue.md create mode 100644 doc_archive/queue/Pipelining_the_GATK_with_Queue.md create mode 100644 doc_archive/queue/QFunction_and_Command_Line_Options.md create mode 100644 doc_archive/queue/Queue_CommandLineFunctions.md create mode 100644 doc_archive/queue/Queue_custom_job_schedulers.md create mode 100644 doc_archive/queue/Queue_pipeline_scripts_(QScripts).md create mode 100644 doc_archive/queue/Queue_with_Grid_Engine.md create mode 100644 doc_archive/queue/Queue_with_IntelliJ_IDEA.md create mode 100644 doc_archive/queue/The_10+_Queuemandents.md create mode 100644 doc_archive/queue/Writing_unit___regression_tests_for_QScripts.md create mode 100644 doc_archive/tutorials/(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md create mode 100644 doc_archive/tutorials/(How_to)_Fix_a_badly_formatted_BAM.md create mode 100644 doc_archive/tutorials/(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md create mode 100644 doc_archive/tutorials/(How_to)_Map_and_clean_up_short_read_sequence_data_efficiently.md create mode 100644 doc_archive/tutorials/(How_to)_Map_reads_to_a_reference_with_alternate_contigs_like_GRCh38.md create mode 100644 doc_archive/tutorials/(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md create mode 100644 doc_archive/tutorials/(How_to)_Simulate_reads_using_a_reference_genome_ALT_contig.md create mode 100644 doc_archive/tutorials/(howto)_Apply_hard_filters_to_a_call_set.md create mode 100644 doc_archive/tutorials/(howto)_Call_variants_with_HaplotypeCaller.md create mode 100644 doc_archive/tutorials/(howto)_Call_variants_with_the_UnifiedGenotyper.md create mode 100644 doc_archive/tutorials/(howto)_Discover_variants_with_GATK_-_A_GATK_Workshop_Tutorial.md create mode 100644 doc_archive/tutorials/(howto)_Evaluate_a_callset_with_CollectVariantCallingMetrics.md create mode 100644 doc_archive/tutorials/(howto)_Evaluate_a_callset_with_VariantEval.md create mode 100644 "doc_archive/tutorials/(howto)_Generate_a_\"bamout_file\"_showing_how_HaplotypeCaller_has_remapped_sequence_reads.md" create mode 100644 doc_archive/tutorials/(howto)_Install_all_software_packages_required_to_follow_the_GATK_Best_Practices..md create mode 100644 doc_archive/tutorials/(howto)_Install_software_for_GATK_workshops.md create mode 100644 doc_archive/tutorials/(howto)_Perform_local_realignment_around_indels.md create mode 100644 doc_archive/tutorials/(howto)_Recalibrate_base_quality_scores_=_run_BQSR.md create mode 100644 doc_archive/tutorials/(howto)_Recalibrate_variant_quality_scores_=_run_VQSR.md create mode 100644 doc_archive/tutorials/(howto)_Revert_a_BAM_file_to_FastQ_format.md create mode 100644 doc_archive/tutorials/(howto)_Run_Queue_for_the_first_time.md create mode 100644 doc_archive/tutorials/(howto)_Run_the_GATK_for_the_first_time.md create mode 100644 doc_archive/tutorials/(howto)_Run_the_genotype_refinement_workflow.md create mode 100644 doc_archive/tutorials/(howto)_Test_your_GATK_installation.md create mode 100644 doc_archive/tutorials/(howto)_Test_your_Queue_installation.md create mode 100644 doc_archive/tutorials/(howto)_Visualize_an_alignment_with_IGV.md create mode 100644 doc_archive/tutorials/Appendix_to_(howto)_Discover_variants_with_GATK.md create mode 100644 doc_archive/tutorials/Tutorial_files_provenance:_ASHG15.md diff --git a/doc_archive/deprecated/(howto)_Map_and_mark_duplicates.md 
b/doc_archive/deprecated/(howto)_Map_and_mark_duplicates.md new file mode 100644 index 000000000..7ae29d6d8 --- /dev/null +++ b/doc_archive/deprecated/(howto)_Map_and_mark_duplicates.md @@ -0,0 +1,63 @@ +## (howto) Map and mark duplicates + +http://gatkforums.broadinstitute.org/gatk/discussion/2799/howto-map-and-mark-duplicates + +
+

See Tutorial#6747 for a comparison of MarkDuplicates and MarkDuplicatesWithMateCigar, downloadable example data to follow along, and additional commentary.

+
+
+

Objective

+

Map the read data to the reference and mark duplicates.

+

Prerequisites

+ +

Steps

+
    +
  1. Identify read group information
  2. +
  3. Generate a SAM file containing aligned reads
  4. +
  5. Convert to BAM, sort and mark duplicates
  6. +
+
+

1. Identify read group information

+

The read group information is key for downstream GATK functionality. The GATK will not work without a read group tag. Make sure to enter as much metadata as you know about your data in the read group fields provided. For more information about all the possible fields in the @RG tag, take a look at the SAM specification.

+

Action

+

Compose the read group identifier in the following format:

+
@RG\tID:group1\tSM:sample1\tPL:illumina\tLB:lib1\tPU:unit1 
+

where the \t stands for the tab character.

+
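For reference, once the aligned SAM file is written, this string becomes an actual @RG header line in which the \t escapes are replaced by real tab characters, along these lines:

@RG	ID:group1	SM:sample1	PL:illumina	LB:lib1	PU:unit1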
+

2. Generate a SAM file containing aligned reads

+

Action

+

Run the following BWA command:

+

In this command, <read group info> is a placeholder for the read group identifier you composed in the previous step.

+
bwa mem -M -R '<read group info>' -p reference.fa raw_reads.fq > aligned_reads.sam 
+

replacing the <read group info> bit with the read group identifier you composed at the previous step.

+

The -M flag causes BWA to mark shorter split hits as secondary (essential for Picard compatibility).

+

Expected Result

+

This creates a file called aligned_reads.sam containing the aligned reads from all input files, combined, annotated and aligned to the same reference.

+

Note that the command above is specific to paired-end data in an interleaved FASTQ file (read pairs together in the same file, with each forward read followed directly by its paired reverse read), which is what we are providing to you as a tutorial file. To map other types of datasets (e.g. single-end reads, or paired-end reads in separate forward/reverse files, as in the sketch below) you will need to adapt the command accordingly. Please see the BWA documentation for exact usage and more options for these commands.

+
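For example, if your paired-end reads are in two separate FASTQ files rather than interleaved in a single file, you would drop the -p flag and list both files (the file names below are just placeholders):

bwa mem -M -R '<read group info>' reference.fa raw_reads_1.fq raw_reads_2.fq > aligned_reads.sam

For single-end reads, simply provide the one FASTQ file, again without -p.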
+

3. Convert to BAM, sort and mark duplicates

+

These initial pre-processing operations format the data to suit the requirements of the GATK tools.

+

Action

+

Run the following Picard command to sort the SAM file and convert it to BAM:

+
java -jar picard.jar SortSam \ 
+    INPUT=aligned_reads.sam \ 
+    OUTPUT=sorted_reads.bam \ 
+    SORT_ORDER=coordinate 
+

Expected Results

+

This creates a file called sorted_reads.bam containing the aligned reads sorted by coordinate.

+

Action

+

Run the following Picard command to mark duplicates:

+
java -jar picard.jar MarkDuplicates \ 
+    INPUT=sorted_reads.bam \ 
+    OUTPUT=dedup_reads.bam \
+    METRICS_FILE=metrics.txt
+

Expected Result

+

This creates a sorted BAM file called dedup_reads.bam with the same content as the input file, except that any duplicate reads are marked as such. It also produces a metrics file called metrics.txt containing (can you guess?) metrics.

+
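If you just want a quick look at the duplication numbers, you can pull the metrics table out of that file on the command line; a minimal sketch, assuming the standard Picard metrics layout (a "## METRICS CLASS" line followed by a header row and one data row per library):

grep -A 2 "## METRICS CLASS" metrics.txt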

Action

+

Run the following Picard command to index the BAM file:

+
java -jar picard.jar BuildBamIndex \ 
+    INPUT=dedup_reads.bam 
+

Expected Result

+

This creates an index file for the BAM file called dedup_reads.bai.

\ No newline at end of file diff --git a/doc_archive/deprecated/(howto)_Perform_local_realignment_around_indels.md b/doc_archive/deprecated/(howto)_Perform_local_realignment_around_indels.md new file mode 100644 index 000000000..856a0d0c2 --- /dev/null +++ b/doc_archive/deprecated/(howto)_Perform_local_realignment_around_indels.md @@ -0,0 +1,44 @@ +## (howto) Perform local realignment around indels + +http://gatkforums.broadinstitute.org/gatk/discussion/2800/howto-perform-local-realignment-around-indels + +

NOTE: This tutorial has been replaced by a more recent and much improved version that you can find here.

+

Objective

+

Perform local realignment around indels to correct mapping-related artifacts.

+

Prerequisites

+ +

Steps

+
    +
  1. Create a target list of intervals to be realigned
  2. +
  3. Perform realignment of the target intervals
  4. +
+
+

1. Create a target list of intervals to be realigned

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T RealignerTargetCreator \ 
+    -R reference.fa \ 
+    -I dedup_reads.bam \ 
+    -L 20 \ 
+    -known gold_indels.vcf \ 
+    -o realignment_targets.list
+

Expected Result

+

This creates a file called realignment_targets.list containing the list of intervals that the program identified as needing realignment within our target, chromosome 20.

+
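The output is a plain-text list of intervals, one per line, in the usual chromosome:start-stop format. The coordinates below are made up, purely to illustrate the layout:

20:10000112-10000180
20:10086283-10086341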

The list of known indel sites (gold_indels.vcf) is used to supply targets for realignment. Only use this argument if such a list is available for your organism.

+
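If you have more than one file of known indel sites (for example the Mills and 1000G gold standard indels plus the 1000G Phase 1 indels from the resource bundle), you can repeat the -known argument; a sketch, assuming b37 bundle file names:

java -jar GenomeAnalysisTK.jar \
    -T RealignerTargetCreator \
    -R reference.fa \
    -I dedup_reads.bam \
    -L 20 \
    -known Mills_and_1000G_gold_standard.indels.b37.vcf \
    -known 1000G_phase1.indels.b37.vcf \
    -o realignment_targets.list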
+

2. Perform realignment of the target intervals

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T IndelRealigner \ 
+    -R reference.fa \ 
+    -I dedup_reads.bam \ 
+    -targetIntervals realignment_targets.list \ 
+    -known gold_indels.vcf \ 
+    -o realigned_reads.bam 
+

Expected Result

+

This creates a file called realigned_reads.bam containing all the original reads, but with better local alignments in the regions that were realigned.

+

Note that here, we didn’t include the -L 20 argument. It's not necessary since the program will only run on the target intervals we are providing.

\ No newline at end of file diff --git a/doc_archive/deprecated/(howto)_Prepare_a_reference_for_use_with_BWA_and_GATK.md b/doc_archive/deprecated/(howto)_Prepare_a_reference_for_use_with_BWA_and_GATK.md new file mode 100644 index 000000000..c87c52972 --- /dev/null +++ b/doc_archive/deprecated/(howto)_Prepare_a_reference_for_use_with_BWA_and_GATK.md @@ -0,0 +1,45 @@ +## (howto) Prepare a reference for use with BWA and GATK + +http://gatkforums.broadinstitute.org/gatk/discussion/2798/howto-prepare-a-reference-for-use-with-bwa-and-gatk + +

NOTE: This tutorial has been replaced by a more recent version that uses GRCh38 that you can find here.

+
+

Objective

+

Prepare a reference sequence so that it is suitable for use with BWA and GATK.

+

Prerequisites

+ +

Steps

+
    +
  1. Generate the BWA index
  2. +
  3. Generate the Fasta file index
  4. +
  5. Generate the sequence dictionary
  6. +
+
+

1. Generate the BWA index

+

Action

+

Run the following BWA command:

+
bwa index -a bwtsw reference.fa 
+

where -a bwtsw specifies that we want to use the indexing algorithm that is capable of handling the whole human genome.

+
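For smaller genomes (anything well under ~2 Gb, e.g. a bacterial or yeast reference), BWA's simpler IS algorithm works as well; a quick sketch with a placeholder file name:

bwa index -a is small_reference.fa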

Expected Result

+

This creates a collection of files used by BWA to perform the alignment.

+
+

2. Generate the fasta file index

+

Action

+

Run the following SAMtools command:

+
samtools faidx reference.fa 
+

Expected Result

+

This creates a file called reference.fa.fai, with one record per line for each of the contigs in the FASTA reference file. Each record is composed of the contig name, size, location, basesPerLine and bytesPerLine.

+
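As an illustration, the record for human chromosome 20 in a b37-style reference looks roughly like this (the third column, the byte offset of the contig within the file, is a made-up value here; 60 and 61 are the typical bases-per-line and bytes-per-line for a 60-column FASTA):

20	63025520	2876892038	60	61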
+

3. Generate the sequence dictionary

+

Action

+

Run the following Picard command:

+
java -jar picard.jar CreateSequenceDictionary \
+    REFERENCE=reference.fa \ 
+    OUTPUT=reference.dict 
+

Note that this is the new syntax for use with the latest version of Picard. Older versions used a slightly different syntax because all the tools were in separate jars, so you'd call e.g. java -jar CreateSequenceDictionary.jar directly.

+
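For the record, with one of those older per-tool jars the equivalent command would have looked something like this:

java -jar CreateSequenceDictionary.jar \
    REFERENCE=reference.fa \
    OUTPUT=reference.dict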

Expected Result

+

This creates a file called reference.dict formatted like a SAM header, describing the contents of your reference FASTA file.
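As a rough illustration, for a reference containing only human chromosome 20 the dictionary would hold an @HD line plus one @SQ line per contig, along these lines (the M5 and UR values are placeholders that depend on your actual file and its location):

@HD	VN:1.5
@SQ	SN:20	LN:63025520	M5:<md5 of the contig sequence>	UR:file:/path/to/reference.fa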

\ No newline at end of file diff --git a/doc_archive/deprecated/Adding_Genomic_Annotations_Using_SnpEff_and_VariantAnnotator.md b/doc_archive/deprecated/Adding_Genomic_Annotations_Using_SnpEff_and_VariantAnnotator.md new file mode 100644 index 000000000..3063b1b25 --- /dev/null +++ b/doc_archive/deprecated/Adding_Genomic_Annotations_Using_SnpEff_and_VariantAnnotator.md @@ -0,0 +1,177 @@ +## Adding Genomic Annotations Using SnpEff and VariantAnnotator + +http://gatkforums.broadinstitute.org/gatk/discussion/50/adding-genomic-annotations-using-snpeff-and-variantannotator + +

This article is out of date and no longer applicable. At this time, we do not provide support for performing functional annotation. Programs that we are aware of and that our collaborators use successfully include Oncotator and Variant Effect Predictor (VEP).

+
+

Our testing has shown that not all combinations of snpEff/database versions produce high-quality results. Be sure to read this document completely to familiarize yourself with our recommended best practices BEFORE running snpEff.

+

Introduction

+

Until recently we were using an in-house annotation tool for genomic annotation, but the burden of keeping the database current and our lack of ability to annotate indels has led us to employ the use of a third-party tool instead. After reviewing many external tools (including annoVar, VAT, and Oncotator), we decided that SnpEff best meets our needs as it accepts VCF files as input, can annotate a full exome callset (including indels) in seconds, and provides continually-updated transcript databases. We have implemented support in the GATK for parsing the output from the SnpEff tool and annotating VCFs with the information provided in it.

+

SnpEff Setup and Usage

+

Download the SnpEff core program. If you want to be able to run VariantAnnotator on the SnpEff output, you'll need to download a version of SnpEff that VariantAnnotator supports from this page (currently supported versions are listed below). If you just want the most recent version of SnpEff and don't plan to run VariantAnnotator on its output, you can get it from here.

+

After unzipping the core program, open the file snpEff.config in a text editor, and change the "database_repository" line to the following:

+
database_repository = http://sourceforge.net/projects/snpeff/files/databases/
+

Then, download one or more databases using SnpEff's built-in download command:

+
java -jar snpEff.jar download GRCh37.64
+

You can find a list of available databases here. The human genome databases have GRCh or hg in their names. You can also download the databases directly from the SnpEff website, if you prefer.

+

The download command by default puts the databases into a subdirectory called data within the directory containing the SnpEff jar file. If you want the databases in a different directory, you'll need to edit the data_dir entry in the file snpEff.config to point to the correct directory.

+
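For example, if you keep the databases on a shared disk, the relevant entry in snpEff.config might look like this (the path is just an illustration):

data_dir = /path/to/shared/snpEff/data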

Run SnpEff on the file containing your variants, and redirect its output to a file. SnpEff supports many input file formats including VCF 4.1, BED, and SAM pileup. Full details and command-line options can be found on the SnpEff home page.

+

Supported SnpEff Versions

+

If you want to take advantage of SnpEff integration in the GATK, you'll need to run SnpEff version 2.0.5. Note: newer versions are currently unsupported by the GATK, as we haven't yet had the resources to test them.

+

Current Recommended Best Practices When Running SnpEff

+

These best practices are based on our analysis of various snpEff/database versions as described in detail in the Analysis of SnpEff Annotations Across Versions section below.

+ +

Analyses of SnpEff Annotations Across Versions

+

See our analysis of the SNP annotations produced by snpEff across various snpEff/database versions here.

+ +

See our comparison of SNP annotations produced using the GRCh37.64 and GRCh37.65 databases with snpEff 2.0.5 here

+ +

See our analysis of the INDEL annotations produced by snpEff across snpEff/database versions here

+ +

Example SnpEff Usage with a VCF Input File

+

Below is an example of how to run SnpEff version 2.0.5 with a VCF input file and have it write its output in VCF format as well. Notice that you need to explicitly specify the database you want to use (in this case, GRCh37.64). This database must be present in a directory of the same name within the data_dir as defined in snpEff.config.

+
java -Xmx4G -jar snpEff.jar eff -v -onlyCoding true -i vcf -o vcf GRCh37.64 1000G.exomes.vcf > snpEff_output.vcf
+

In this mode, SnpEff aggregates all effects associated with each variant record together into a single INFO field annotation with the key EFF. The general format is:

+
EFF=Effect1(Information about Effect1),Effect2(Information about Effect2),etc.
+

And here is the precise layout with all the subfields:

+
EFF=Effect1(Effect_Impact|Effect_Functional_Class|Codon_Change|Amino_Acid_Change|Gene_Name|Gene_BioType|Coding|Transcript_ID|Exon_ID),Effect2(etc...
+

It's also possible to get SnpEff to output in a (non-VCF) text format with one Effect per line. See the SnpEff home page for full details.

+

Adding SnpEff Annotations using VariantAnnotator

+

Once you have a SnpEff output VCF file, you can use the VariantAnnotator walker to add SnpEff annotations based on that output to the input file you ran SnpEff on.

+

There are two different options for doing this:

+

Option 1: Annotate with only the highest-impact effect for each variant

+

NOTE: This option works only with supported SnpEff versions as explained above. VariantAnnotator run as described below will refuse to parse SnpEff output files produced by other versions of the tool, or which lack a SnpEff version number in their header.

+

The default behavior when you run VariantAnnotator on a SnpEff output file is to parse the complete set of effects resulting from the current variant, select the most biologically-significant effect, and add annotations for just that effect to the INFO field of the VCF record for the current variant. This is the mode we plan to use in our Production Data-Processing Pipeline.

+

When selecting the most biologically-significant effect associated with the current variant, VariantAnnotator does the following:

+ +

Example Usage:

+
java -jar dist/GenomeAnalysisTK.jar \
+     -T VariantAnnotator \
+     -R /humgen/1kg/reference/human_g1k_v37.fasta \
+     -A SnpEff \       
+     --variant 1000G.exomes.vcf \        (file to annotate)
+     --snpEffFile snpEff_output.vcf \    (SnpEff VCF output file generated by running SnpEff on the file to annotate)
+     -L 1000G.exomes.vcf \
+     -o out.vcf
+

VariantAnnotator adds some or all of the following INFO field annotations to each variant record:

+ +

Example VCF records annotated using SnpEff and VariantAnnotator:

+
1   874779  .   C   T   279.94  . AC=1;AF=0.0032;AN=310;BaseQRankSum=-1.800;DP=3371;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=1.4493;InbreedingCoeff=-0.0045;
+MQ=54.49;MQ0=10;MQRankSum=0.982;QD=13.33;ReadPosRankSum=-0.060;SB=-120.09;SNPEFF_AMINO_ACID_CHANGE=G215;SNPEFF_CODON_CHANGE=ggC/ggT;
+SNPEFF_EFFECT=SYNONYMOUS_CODING;SNPEFF_EXON_ID=exon_1_874655_874840;SNPEFF_FUNCTIONAL_CLASS=SILENT;SNPEFF_GENE_BIOTYPE=protein_coding;SNPEFF_GENE_NAME=SAMD11;
+SNPEFF_IMPACT=LOW;SNPEFF_TRANSCRIPT_ID=ENST00000342066
+
+1   874816  .   C   CT  2527.52 .   AC=15;AF=0.0484;AN=310;BaseQRankSum=-11.876;DP=4718;FS=48.575;HRun=1;HaplotypeScore=91.9147;InbreedingCoeff=-0.0520;
+MQ=53.37;MQ0=6;MQRankSum=-1.388;QD=5.92;ReadPosRankSum=-1.932;SB=-741.06;SNPEFF_EFFECT=FRAME_SHIFT;SNPEFF_EXON_ID=exon_1_874655_874840;
+SNPEFF_FUNCTIONAL_CLASS=NONE;SNPEFF_GENE_BIOTYPE=protein_coding;SNPEFF_GENE_NAME=SAMD11;SNPEFF_IMPACT=HIGH;SNPEFF_TRANSCRIPT_ID=ENST00000342066
+

Option 2: Annotate with all effects for each variant

+

VariantAnnotator also has the ability to take the EFF field from the SnpEff VCF output file containing all the effects aggregated together and copy it verbatim into the VCF to annotate.

+

Here's an example of how to do this:

+
java -jar dist/GenomeAnalysisTK.jar \
+     -T VariantAnnotator \
+     -R /humgen/1kg/reference/human_g1k_v37.fasta \      
+     -E resource.EFF \
+     --variant 1000G.exomes.vcf \      (file to annotate)
+     --resource snpEff_output.vcf \    (SnpEff VCF output file generated by running SnpEff on the file to annotate)
+     -L 1000G.exomes.vcf \
+     -o out.vcf
+

Of course, in this case you can also use the VCF output by SnpEff directly, but if you are using VariantAnnotator for other purposes anyway the above might be useful.

+

List of Genomic Effects

+

Below are the possible genomic effects recognized by SnpEff, grouped by biological impact. Full descriptions of each effect are available on this page.

+

High-Impact Effects

+ +

Moderate-Impact Effects

+ +

Low-Impact Effects

+ +

Modifiers

+ +

Functional Classes

+

SnpEff assigns a functional class to certain effects, in addition to an impact:

+ +

The GATK prioritizes effects with functional classes over effects of equal impact that lack a functional class when selecting the most significant effect in VariantAnnotator. This is to enable accurate counts of NONSENSE/MISSENSE/SILENT sites.

\ No newline at end of file diff --git a/doc_archive/deprecated/BWA_C_Bindings_-_RETIRED.md b/doc_archive/deprecated/BWA_C_Bindings_-_RETIRED.md new file mode 100644 index 000000000..2e4694066 --- /dev/null +++ b/doc_archive/deprecated/BWA_C_Bindings_-_RETIRED.md @@ -0,0 +1,336 @@ +## BWA/C Bindings - RETIRED + +http://gatkforums.broadinstitute.org/gatk/discussion/60/bwa-c-bindings-retired + +

Please note that this article has not been updated in a very long time and may no longer be applicable. Use at your own risk.

+
+

Sting BWA/C Bindings

+

WARNING: This tool was experimental and unsupported and never made it beyond a beta version. Use at your own risk. +

The GSA group has made bindings available for Heng Li's Burrows-Wheeler Aligner (BWA). Our aligner bindings present additional functionality to the user not traditionally available with BWA. BWA standalone is optimized to do fast, low-memory alignments from Fastq to BAM. While our bindings aim to provide support for reasonably fast, reasonably low memory alignment, we add the capacity to do exploratory data analyses. The bindings can provide all alignments for a given read, allowing a user to walk over the alignments and see information not typically provided in the BAM format. Users of the bindings can 'go deep', selectively relaxing alignment parameters one read at a time, looking for the best alignments at a site. +

The BWA/C bindings should be thought of as alpha release quality. However, we aim to be particularly responsive to issues in the bindings as they arise. Because of the bindings' alpha state, some functionality is limited; see the Limitations section below for more details on what features are currently supported. +

+

Contents

+ +
+

A note about using the bindings

+

Whenever native code is called from Java, the user must assist Java in finding the proper shared library. Java looks for shared libraries in two places, on the system-wide library search path and through Java properties invoked on the command line. To add libbwa.so to the global library search path, add the following to your .my.bashrc, .my.cshrc, or other startup file: +

+
bash
+
+export LD_LIBRARY_PATH=/humgen/gsa-scr1/GATK_Data/bwa/stable:$LD_LIBRARY_PATH
+
+
csh
+
+setenv LD_LIBRARY_PATH /humgen/gsa-scr1/GATK_Data/bwa/stable:$LD_LIBRARY_PATH
+
+

To specify the location of libbwa.so directly on the command-line, use the java.library.path system property as follows: +

+
+java -Djava.library.path=/humgen/gsa-scr1/GATK_Data/bwa/stable \
+    -jar dist/GenomeAnalysisTK.jar \
+    -T AlignmentValidation \
+    -I /humgen/gsa-hphome1/hanna/reference/1kg/NA12878_Pilot1_20.bwa.bam \
+    -R /humgen/gsa-scr1/GATK_Data/bwa/human_b36_both.fasta
+
+

Preparing to use the aligner

+

Within the Broad Institute

+

We provide internally accessible versions of both the BWA shared library and precomputed BWA indices for two commonly used human references at the Broad (Homo_sapiens_assembly18.fasta and human_b36_both.fasta). These files live in the following directory: +

+
+/humgen/gsa-scr1/GATK_Data/bwa/stable
+
+

Outside of the Broad Institute

+

Two steps are required in preparing to use the aligner: building the shared library and using BWA/C to generate an index of the reference sequence. +

The Java bindings to the aligner are available through the Sting repository. A precompiled version of the bindings is available for Linux; +these bindings are available in c/bwa/libbwa.so.1. To build the aligner from source: +

+ +
+sh autogen.sh
+./configure
+make
+
+ +

To build an index of the reference sequence, use the BWA/C executable directly: +

+
+bwa index -a bwtsw <your reference sequence>.fasta
+
+

Using the existing GATK alignment walkers

+

Two walkers are provided for end users of the GATK. The first of the stock walkers is Align, which can align an unmapped BAM file or realign a mapped BAM file. +

+
+java \
+-Djava.library.path=/humgen/gsa-scr1/GATK_Data/bwa/stable \
+    -jar dist/GenomeAnalysisTK.jar \
+    -T Align \
+    -I NA12878_Pilot1_20.unmapped.bam \
+    -R /humgen/gsa-scr1/GATK_Data/bwa/human_b36_both.fasta \
+    -U \
+    -ob human.unsorted.bam
+
+

Most of the available parameters here are standard GATK. -T specifies that the alignment analysis should be used; -I specifies the unmapped BAM file to align, and -R specifies the reference to which to align. By default, this walker assumes that the bwa index support files will live alongside the reference. If these files are stored elsewhere, the optional -BWT argument can be used to specify their location. By default, alignments will be emitted to the console in SAM format. Alignments can be spooled to disk in SAM format using the -o option or spooled to disk in BAM format using the -ob option. +

The other stock walker is AlignmentValidation, which computes all possible alignments based on the BWA default configuration settings and makes sure at least +one of the top alignments matches the alignment stored in the read. +

+
+java \
+-Djava.library.path=/humgen/gsa-scr1/GATK_Data/bwa/stable \
+    -jar dist/GenomeAnalysisTK.jar \
+    -T AlignmentValidation \
+    -I /humgen/gsa-hphome1/hanna/reference/1kg/NA12878_Pilot1_20.bwa.bam \
+    -R /humgen/gsa-scr1/GATK_Data/bwa/human_b36_both.fasta
+
+

Options for the AlignmentValidation walker are identical to those of the Align walker, except that the AlignmentValidation walker's only output is an exception if validation fails. +

Another sample walker of limited scope, CountBestAlignmentsWalker, is available for review; it is discussed in the example section below. +

+

Writing new GATK walkers utilizing alignment bindings

+

A BWA/C aligner can be created on the fly using the org.broadinstitute.sting.alignment.bwa.c.BWACAligner constructor. The bindings have two sets of interfaces: an interface which returns all possible alignments +and an interface which randomly selects an alignment from a list of the top scoring alignments as selected by BWA. +

To iterate through all alignments, use the following method: +

+
+    /**
+     * Get a iterator of alignments, batched by mapping quality.
+     * @param bases List of bases.
+     * @return Iterator to alignments.
+     */
+    public Iterable<Alignment[]> getAllAlignments(final byte[] bases);
+
+

The call will return an Iterable which batches alignments by score. Each call to next() on the provided iterator will return all Alignments of a given score, ordered from +best to worst. For example, given a read sequence with at least one match on the genome, the first call to next() will supply all exact matches, and subsequent calls +to next() will give alignments judged to be inferior by BWA (alignments containing mismatches, gap opens, or gap extensions). +

Alignments can be transformed to reads using the following static method in org.broadinstitute.sting.alignment.Alignment: +

+
+    /**
+     * Creates a read directly from an alignment.
+     * @param alignment The alignment to convert to a read.
+     * @param unmappedRead Source of the unmapped read.  Should have bases, quality scores, and flags.
+     * @param newSAMHeader The new SAM header to use in creating this read.  Can be null, but if so, the sequence
+     *                     dictionary in the
+     * @return A mapped alignment.
+     */
+    public static SAMRecord convertToRead(Alignment alignment, SAMRecord unmappedRead, SAMFileHeader newSAMHeader);
+
+

A convenience method is available which allows the user to get SAMRecords directly from the aligner. +

+
+    /**
+     * Get a iterator of aligned reads, batched by mapping quality.
+     * @param read Read to align.
+     * @param newHeader Optional new header to use when aligning the read.  If present, it must be null.
+     * @return Iterator to alignments.
+     */
+    public Iterable<SAMRecord[]> alignAll(final SAMRecord read, final SAMFileHeader newHeader);
+
+

To return a single read randomly selected by the bindings, use one of the following methods: +

+
+    /**
+     * Allow the aligner to choose one alignment randomly from the pile of best alignments.
+     * @param bases Bases to align.
+     * @return An alignment chosen randomly from the pile of best alignments.
+     */
+    public Alignment getBestAlignment(final byte[] bases);
+
+    /**
+     * Align the read to the reference.
+     * @param read Read to align.
+     * @param header Optional header to drop in place.
+     * @return The aligned read.
+     */
+    public SAMRecord align(final SAMRecord read, final SAMFileHeader header);
+
+

The org.broadinstitute.sting.alignment.bwa.BWAConfiguration argument allows the user to specify parameters normally specified to 'bwa aln'. Available parameters are: +

+ +

Settings must be supplied to the constructor; leaving any BWAConfiguration field unset means that BWA should use its default value for that argument. Configuration +settings can be updated at any time using the BWACAligner updateConfiguration method. +

+
+    public void updateConfiguration(BWAConfiguration configuration);
+
+

Running the aligner outside of the GATK

+

The BWA/C bindings were written with running outside of the GATK in mind, but this workflow has never been tested. If you would like to run the bindings outside of the +GATK, you will need: +

+ +

To build the packaged version of the aligner, run the following command +

+
+cp $STING_HOME/lib/bcel-*.jar ~/.ant/lib
+ant package -Dexecutable=Aligner
+
+

This command will extract all classes required to run the aligner and place them in $STING_HOME/dist/packages/Aligner/Aligner.jar. You can then specify this one jar in your project's dependencies. +

+

Limitations

+

The BWA/C bindings are currently in an alpha state, but they are actively supported. Because of the bindings' alpha state, some functionality is limited. The limitations of these bindings include: +

+ +

Example: analysis of alignments with the BWA bindings

+

In order to validate that the Java bindings compute the same alignments as BWA/C standalone, we modified the BWA source to record, for each read, the number of equally scoring alignments, along with the frequency of each such count. We then implemented the same tally using a walker written in the GATK. We computed this distribution over a set of 36bp human reads and found the distributions to be identical. +

The relevant parts of the walker follow. +

+
+public class CountBestAlignmentsWalker extends ReadWalker<Integer,Integer> {
+    /**
+     * The supporting BWT index generated using BWT.
+     */
+    @Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false)
+    String prefix = null;
+
+    /**
+     * The actual aligner.
+     */
+    private Aligner aligner = null;
+
+    private SortedMap<Integer,Integer> alignmentFrequencies = new TreeMap<Integer,Integer>();
+
+    /**
+     * Create an aligner object.  The aligner object will load and hold the BWT until close() is called.
+     */
+    @Override
+    public void initialize() {
+        BWTFiles bwtFiles = new BWTFiles(prefix);
+        BWAConfiguration configuration = new BWAConfiguration();
+        aligner = new BWACAligner(bwtFiles,configuration);
+    }
+
+    /**
+     * Aligns a read to the given reference.
+     * @param ref Reference over the read.  Read will most likely be unmapped, so ref will be null.
+     * @param read Read to align.
+     * @return Number of alignments found for this read.
+     */
+    @Override
+    public Integer map(char[] ref, SAMRecord read) {
+        Iterator<Alignment[]> alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator();
+        if(alignmentIterator.hasNext()) {
+            int numAlignments = alignmentIterator.next().length;
+            if(alignmentFrequencies.containsKey(numAlignments))
+                alignmentFrequencies.put(numAlignments,alignmentFrequencies.get(numAlignments)+1);
+            else
+                alignmentFrequencies.put(numAlignments,1);
+        }
+        return 1;
+    }    
+
+    /**
+     * Initial value for reduce.  In this case, validated reads will be counted.
+     * @return 0, indicating no reads yet validated.
+     */
+    @Override
+    public Integer reduceInit() { return 0; }
+
+    /**
+     * Calculates the number of reads processed.
+     * @param value Number of reads processed by this map.
+     * @param sum Number of reads processed before this map.
+     * @return Number of reads processed up to and including this map.
+     */
+    @Override
+    public Integer reduce(Integer value, Integer sum) {
+        return value + sum;
+    }
+
+    /**
+     * Cleanup.
+     * @param result Number of reads processed.
+     */
+    @Override
+    public void onTraversalDone(Integer result) {
+        aligner.close();
+        for(Map.Entry<Integer,Integer> alignmentFrequency: alignmentFrequencies.entrySet())
+            out.printf("%d\t%d%n", alignmentFrequency.getKey(), alignmentFrequency.getValue());
+        super.onTraversalDone(result);
+    }
+}
+
+

This walker can be run within the svn version of the GATK using -T CountBestAlignments. +

The resulting placement count frequency is shown in the graph below. The number of placements clearly follows an exponential distribution. +

Bwa dist.png +

+

Validation methods

+

Two major techniques were used to validate the Java bindings against the current BWA implementation. +

+ +

As an ongoing validation strategy, we will use the GATK integration test suite to align a small unmapped BAM file with human data. The contents of the unmapped BAM file will be aligned and written to disk. The md5 of the resulting file will be calculated and compared to a known good md5. +

+

Unsupported: using the BWA/C bindings from within Matlab

+

Some users are attempting to use the BWA/C bindings from within Matlab. To run the GATK within Matlab, you'll need to add libbwa.so to your library path through the librarypath.txt file. The librarypath.txt file normally lives in $matlabroot/toolbox/local. Within the Broad Institute, the $matlabroot/toolbox/local/librarypath.txt file is shared; therefore, you'll have to create a librarypath.txt file in your working directory from which you execute matlab. +

+
+##
+## FILE: librarypath.txt
+##
+## Entries:
+##    o path_to_jnifile
+##    o [alpha,glnx86,sol2,unix,win32,mac]=path_to_jnifile
+##    o $matlabroot/path_to_jnifile
+##    o $jre_home/path_to_jnifile
+##
+$matlabroot/bin/$arch
+/humgen/gsa-scr1/GATK_Data/bwa/stable
+
+

Once you've edited the library path, you can verify that Matlab has picked up your modified file by running the following command: +

+
+>> java.lang.System.getProperty('java.library.path')
+
+ans =
+/broad/tools/apps/matlab2009b/bin/glnxa64:/humgen/gsa-scr1/GATK_Data/bwa/stable
+
+

Once the location of libbwa.so has been added to the library path, you can use the BWACAligner just as you would any other Java class in Matlab: +

+
+>> javaclasspath({'/humgen/gsa-scr1/hanna/src/Sting/dist/packages/Aligner/Aligner.jar'})
+>> import org.broadinstitute.sting.alignment.bwa.BWTFiles
+>> import org.broadinstitute.sting.alignment.bwa.BWAConfiguration
+>> import org.broadinstitute.sting.alignment.bwa.c.BWACAligner
+>> x = BWACAligner(BWTFiles('/humgen/gsa-scr1/GATK_Data/bwa/Homo_sapiens_assembly18.fasta'),BWAConfiguration())
+>> y=x.getAllAlignments(uint8('CCAATAACCAAGGCTGTTAGGTATTTTATCAGCAATGTGGGATAAGCAC'));
+
+

We don't have the resources to directly support using the BWA/C bindings from within Matlab, but if you report problems to us, we will try to address them. +

\ No newline at end of file diff --git a/doc_archive/deprecated/Data_Processing_Pipeline_-_RETIRED.md b/doc_archive/deprecated/Data_Processing_Pipeline_-_RETIRED.md new file mode 100644 index 000000000..926dc0271 --- /dev/null +++ b/doc_archive/deprecated/Data_Processing_Pipeline_-_RETIRED.md @@ -0,0 +1,158 @@ +## Data Processing Pipeline - RETIRED + +http://gatkforums.broadinstitute.org/gatk/discussion/41/data-processing-pipeline-retired + +

Please note that the DataProcessingPipeline qscript is no longer available. We are looking into the possibility of producing some new Qscripts that will be more appropriate for sharing with the public.

+

The DPP script was only provided as an example, but many people were using it "out of the box" without properly understanding how it works. In order to protect users from mishandling this tool, and to decrease our support burden, we have taken the difficult decision of removing the script from our public repository. If you would like to put together your own version of the DPP, please have a look at our other example scripts to understand how Qscripts work, and read the Best Practices documentation to understand what the processing steps are and which parameters you need to set or adjust.

+

Data Processing Pipeline

+

The Data Processing Pipeline is a Queue script designed to take BAM files from the NGS machines to analysis ready BAMs for the GATK.

+

Introduction

+

Reads come off the sequencers in a raw state that is not suitable for analysis using the GATK. In order to prepare the dataset, one must perform the steps described here. This pipeline performs the following steps: indel cleaning, duplicate marking and base score recalibration, following the GSA's latest definition of best practices. The product of this pipeline is a set of analysis ready BAM files (one per sample sequenced).

+

Requirements

+

This pipeline is a Queue script that uses tools from the GATK, Picard and BWA (optional) software suites which are all freely available through their respective websites. Queue is a GATK companion that is included in the GATK package.

+

Warning: This pipeline was designed specifically to handle the Broad Institute's main sequencing pipeline with Illumina BAM files and BWA alignment. The GSA cannot support its use for other types of datasets. It is possible however, with some effort, to modify it for your needs.

+

Command-line arguments

+

Required Parameters

| Argument (short-name) | Argument (long-name) | Description |
| --- | --- | --- |
| -i <BAM file / BAM list> | --input <BAM file / BAM list> | Input BAM file, or list of BAM files. |
| -R <fasta> | --reference <fasta> | Reference fasta file. |
| -D <vcf> | --dbsnp <dbsnp vcf> | dbSNP ROD to use (must be in VCF format). |
+

Optional Parameters

| Argument (short-name) | Argument (long-name) | Description |
| --- | --- | --- |
| -indels <vcf> | --extra_indels <vcf> | VCF files to use as reference indels for Indel Realignment. |
| -bwa <path> | --path_to_bwa <path> | The path to the binary of bwa (usually BAM files have already been mapped, but if you want to remap this is the option). |
| -outputDir <path> | --output_directory <path> | Output path for the processed BAM files. |
| -L <GATK interval string> | --gatk_interval_string <GATK interval string> | The -L interval string to be used by GATK; output BAMs at this interval only. |
| -intervals <GATK interval file> | --gatk_interval_file <GATK interval file> | An intervals file to be used by GATK; output BAMs at these intervals. |
+

Modes of Operation (also optional parameters)

| Argument (short-name) | Argument (long-name) | Description |
| --- | --- | --- |
| -p <name> | --project <name> | The project name determines the final output (BAM file) base name. Example: NA12878 yields NA12878.processed.bam. |
| -knowns | --knowns_only | Perform cleaning on knowns only. |
| -sw | --use_smith_waterman | Perform cleaning using Smith-Waterman. |
| -bwase | --use_bwa_single_ended | Decompose the input BAM file and fully realign it using BWA, assuming single-ended reads. |
| -bwape | --use_bwa_pair_ended | Decompose the input BAM file and fully realign it using BWA, assuming pair-ended reads. |
+

The Pipeline

+

Data processing pipeline of the best practices for raw data processing, from sequencer data (FASTQ files) to analysis-ready reads (BAM file):

+

the data processing pipeline

+

Following the group's Best Practices definition, the data processing pipeline does all the processing at the sample level. There are two high-level parts of the pipeline:

+

BWA alignment

+

This option is for datasets that have already been processed using a different pipeline or different criteria, and you want to reprocess it using this pipeline. One example is a BAM file that has been processed at the lane level, or did not perform some of the best practices steps of the current pipeline. By using the optional BWA stage of the processing pipeline, your BAM file will be realigned from scratch before creating sample level bams and entering the pipeline.

+

Sample Level Processing

+

This is the where the pipeline applies its main procedures: Indel Realignment and Base Quality Score Recalibration.

+

Indel Realignment

+

This is a two-step process. First we create targets using the Realigner Target Creator (either for known indels only, or also including indels detected in the data), then we realign the targets using the Indel Realigner (see [Local realignment around indels]), with optional Smith-Waterman realignment. The Indel Realigner also fixes mate pair information for reads that get realigned.

+

Base Quality Score Recalibration

+

This is a crucial step that re-adjusts the quality score using statistics based on several different covariates. In this pipeline we utilize four: Read Group Covariate, Quality Score Covariate, Cycle Covariate, Dinucleotide Covariate

+

The Outputs

+

The Data Processing Pipeline produces three types of output for each sample: a fully processed BAM file, a validation report on the input and output BAM files, and an analysis of base qualities before and after base quality score recalibration. If you look at the pipeline flowchart, the grey boxes indicate processes that generate an output.

+

Processed Bam File

+

The final product of the pipeline is one BAM file per sample in the dataset. It also provides one BAM list with all the bams in the dataset. This file is named <project name>.cohort.list, and each sample bam file has the name <project name>.<sample name>.bam. The sample names are extracted from the input BAM headers, and the project name is provided as a parameter to the pipeline.

+

Validation Files

+

We validate each unprocessed sample-level BAM file and each final processed sample-level BAM file. The validation is performed using Picard's ValidateSamFile. Because the parameters of this validation are very strict, we don't enforce that the input BAM has to pass all validation, but we provide the log of the validation as an informative companion to your input. The validation files are named <project name>.<sample name>.pre.validation and <project name>.<sample name>.post.validation.

+
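For reference, this kind of validation can also be run directly with Picard outside the pipeline. The invocation below is purely illustrative; it is not part of the pipeline script, and the file names are placeholders:

# illustrative only; file names are placeholders
java -jar picard.jar ValidateSamFile \
+    I=mySample.bam \
+    MODE=SUMMARY \
+    O=mySample.pre.validation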

Notice that even if your BAM file fails validation, the pipeline can still go through successfully. The validation is a strict report on how your BAM file is looking. Some errors are not critical, but the output files (both pre.validation and post.validation) should give you some input on how to make your dataset better organized in the BAM format.

+

Base Quality Score Recalibration Analysis

+

PDF plots of the base qualities are generated before and after recalibration for further analysis on the impact of recalibrating the base quality scores in each sample file. These graphs are explained in detail here. The plots are created in directories named : <project name>.<sample name>.pre and <project name>.<sample name>.post.

+

Examples

+
  1. Example script that runs the data processing pipeline with its standard parameters and uses LSF for scatter/gathering (without bwa):

    java \
+        -Xmx4g \
+        -Djava.io.tmpdir=/path/to/tmpdir \
+        -jar path/to/GATK/Queue.jar \
+        -S path/to/DataProcessingPipeline.scala \
+        -p myFancyProjectName \
+        -i myDataSet.list \
+        -R reference.fasta \
+        -D dbSNP.vcf \
+        -run

  2. Performing realignment and the full data processing pipeline on one pair-ended BAM file:

    java \
+        -Xmx4g \
+        -Djava.io.tmpdir=/path/to/tmpdir \
+        -jar path/to/Queue.jar \
+        -S path/to/DataProcessingPipeline.scala \
+        -bwa path/to/bwa \
+        -i test.bam \
+        -R reference.fasta \
+        -D dbSNP.vcf \
+        -p myProjectWithRealignment \
+        -bwape \
+        -run
\ No newline at end of file diff --git a/doc_archive/deprecated/Errors_about_BAM_or_VCF_files_not_being_ordered_properly.md b/doc_archive/deprecated/Errors_about_BAM_or_VCF_files_not_being_ordered_properly.md new file mode 100644 index 000000000..f0ca487b9 --- /dev/null +++ b/doc_archive/deprecated/Errors_about_BAM_or_VCF_files_not_being_ordered_properly.md @@ -0,0 +1,16 @@ +## Errors about BAM or VCF files not being ordered properly + +http://gatkforums.broadinstitute.org/gatk/discussion/58/errors-about-bam-or-vcf-files-not-being-ordered-properly + +

This article has been deprecated

+

For a more recent version please see https://www.broadinstitute.org/gatk/guide/article?id=1328

+
+

This error occurs when, for example, a collaborator gives you a BAM that's derived from what was originally the same reference as you are using, but for whatever reason the contigs are not sorted in the same order. The GATK can be particular about the ordering of a BAM file, so it will fail with an error in this case.

+

So what do you do? You use a Picard tool called ReorderSam to, well, reorder your BAM file.

+

Here's an example usage where we reorder a BAM file that was sorted lexicographically so that the output will be another BAM, but this time sorted karyotypically:

+
java -jar picard.jar ReorderSam \
+    I=lexicographic.bam \
+    O=karyotypic.bam \
+    REFERENCE=Homo_sapiens_assembly18.karyotypic.fasta
+

This tool requires you have a correctly sorted version of the reference sequence you used to align your reads. Be aware that this tool will drop reads that don't have equivalent contigs in the new reference (potentially bad, but maybe not). If contigs have the same name in the bam and the new reference, this tool assumes that the alignment of the read in the new BAM is the same. This is not a liftover tool!

+
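If you do not already have a sequence dictionary and FASTA index for the correctly sorted reference, they can be created with Picard and samtools, as shown elsewhere in this archive. These commands are illustrative; substitute your own reference file name:

# illustrative only; substitute your correctly sorted reference
java -jar picard.jar CreateSequenceDictionary R=Homo_sapiens_assembly18.karyotypic.fasta O=Homo_sapiens_assembly18.karyotypic.dict
+samtools faidx Homo_sapiens_assembly18.karyotypic.fasta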

This tool is part of the Picard package.

\ No newline at end of file diff --git a/doc_archive/deprecated/Genotype_and_Validate.md b/doc_archive/deprecated/Genotype_and_Validate.md new file mode 100644 index 000000000..7d6789989 --- /dev/null +++ b/doc_archive/deprecated/Genotype_and_Validate.md @@ -0,0 +1,76 @@ +## Genotype and Validate + +http://gatkforums.broadinstitute.org/gatk/discussion/61/genotype-and-validate + +

Please note that this article has not been updated in a very long time and may no longer be applicable. Use at your own risk.

+
+

Introduction

+

Genotype and Validate is a tool to assess the quality of a technology dataset for calling SNPs and indels, given a secondary (validation) data source.

+

The simplest scenario is when you have a VCF of hand-annotated SNPs and indels, and you want to know how well a particular technology performs calling these SNPs. With a dataset (BAM file) generated by the technology under test, and the hand-annotated VCF, you can run GenotypeAndValidate to assess the accuracy of the calls with the new technology's dataset.

+

Another option is to validate the calls on a VCF file, using a deep coverage BAM file that you trust the calls on. The GenotypeAndValidate walker will make calls using the reads in the BAM file and take them as truth, then compare to the calls in the VCF file and produce a truth table.

+

Command-line arguments

+

Usage of GenotypeAndValidate and its command line arguments are described here.

+

The VCF Annotations

+

The annotations can be either true positive (T) or false positive (F). 'T' means it is known to be a true SNP/Indel, while a 'F' means it is known not to be a SNP/Indel but the technology used to create the VCF calls it. To annotate the VCF, simply add an INFO field GV with the value T or F.

+

The Outputs

+

GenotypeAndValidate has two outputs. The truth table and the optional VCF file. The truth table is a 2x2 table correlating what was called in the dataset with the truth of the call (whether it's a true positive or a false positive). The table should look like this:

|  | ALT | REF | Predictive Value |
| --- | --- | --- | --- |
| called alt | True Positive (TP) | False Positive (FP) | Positive PV |
| called ref | False Negative (FN) | True Negative (TN) | Negative PV |
+

The positive predictive value (PPV) is the proportion of subjects with positive test results who are correctly diagnosed.

+

The negative predictive value (NPV) is the proportion of subjects with a negative test result who are correctly diagnosed.

+
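In terms of the counts in the truth table above, these are the standard definitions:

$$ PPV = \frac{TP}{TP + FP} \qquad NPV = \frac{TN}{TN + FN} $$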

The optional VCF file will contain only the variants that were called or not called, excluding the ones that were uncovered or didn't pass the filters (-depth). This file is useful if you are trying to compare the PPV and NPV of two different technologies on the exact same sites (so you can compare apples to apples).

+

Additional Details

+ +

Examples

+

Genotypes BAM file from new technology using the VCF as a truth dataset:

+
java \
+    -jar /GenomeAnalysisTK.jar \
+    -T  GenotypeAndValidate \
+    -R human_g1k_v37.fasta \
+    -I myNewTechReads.bam \
+    -alleles handAnnotatedVCF.vcf \
+    -BTI alleles \
+    -o gav.vcf
+

An annotated VCF example (info field clipped for clarity)

+
#CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  NA12878
+1   20568807    .   C   T   0    HapMapHet        AC=1;AF=0.50;AN=2;DP=0;GV=T  GT  0/1
+1   22359922    .   T   C   282  WG-CG-HiSeq      AC=2;AF=0.50;GV=T;AN=4;DP=42 GT:AD:DP:GL:GQ  1/0 ./. 0/1:20,22:39:-72.79,-11.75,-67.94:99    ./.
+13  102391461   .   G   A   341  Indel;SnpCluster AC=1;GV=F;AF=0.50;AN=2;DP=45 GT:AD:DP:GL:GQ  ./. ./. 0/1:32,13:45:-50.99,-13.56,-112.17:99   ./.
+1   175516757   .   C   G   655  SnpCluster,WG    AC=1;AF=0.50;AN=2;GV=F;DP=74 GT:AD:DP:GL:GQ  ./. ./. 0/1:52,22:67:-89.02,-20.20,-191.27:99   ./.
+

Using a BAM file as the truth dataset:

+
java \
+    -jar /GenomeAnalysisTK.jar \
+    -T  GenotypeAndValidate \
+    -R human_g1k_v37.fasta \
+    -I myTruthDataset.bam \
+    -alleles callsToValidate.vcf \
+    -BTI alleles \
+    -bt \
+    -o gav.vcf
+

Example truth table of PacBio reads (BAM) to validate HiSeq annotated dataset (VCF) using the GenotypeAndValidate walker:

+

PacBio PbGenotypeAndValidate results

\ No newline at end of file diff --git a/doc_archive/deprecated/How_to_get_and_install_Firepony.md b/doc_archive/deprecated/How_to_get_and_install_Firepony.md new file mode 100644 index 000000000..9fe764acc --- /dev/null +++ b/doc_archive/deprecated/How_to_get_and_install_Firepony.md @@ -0,0 +1,26 @@ +## How to get and install Firepony + +http://gatkforums.broadinstitute.org/gatk/discussion/6020/how-to-get-and-install-firepony + +

Binary packages for various versions of Linux are available at http://packages.shadau.com/

+

Below are installation instructions for Debian, Ubuntu, CentOS and Fedora. For other Linux distributions, the Firepony source code is available at https://github.com/broadinstitute/firepony along with compilation instructions.

+
+

On Debian or Ubuntu systems

+

The following commands can be used to install Firepony:

+
sudo apt-get install software-properties-common
+sudo add-apt-repository http://packages.shadau.com/
+sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-key 285514D704F4CDB7
+sudo apt-get update
+sudo apt-get install firepony
+

Once this initial install is done, updates will be automatically installed as part of the standard Ubuntu/Debian update procedure.

+
+

On CentOS 7 and Fedora 21 systems

+

On CentOS 7, the following commands can be used to install Firepony:

+
sudo curl -o /etc/yum.repos.d/packages.shadau.com.repo \
+    http://packages.shadau.com/rpm/centos-7/packages.shadau.com.repo
+sudo yum install firepony
+

For Fedora 21, use the following sequence of commands:

+
sudo curl -o /etc/yum.repos.d/packages.shadau.com.repo \
+    http://packages.shadau.com/rpm/fedora-21/packages.shadau.com.repo
+sudo yum install firepony
+

Any subsequent updates will automatically be installed when running ‘yum update’.

\ No newline at end of file diff --git a/doc_archive/deprecated/How_to_use_Firepony.md b/doc_archive/deprecated/How_to_use_Firepony.md new file mode 100644 index 000000000..5be90b98b --- /dev/null +++ b/doc_archive/deprecated/How_to_use_Firepony.md @@ -0,0 +1,46 @@ +## How to use Firepony + +http://gatkforums.broadinstitute.org/gatk/discussion/6021/how-to-use-firepony + +

Firepony can be run with the following command line arguments:

+
firepony -r <reference FASTA file> -s <SNP database file> -o <output table file> <input alignment file>
+

where:

- `-r`: the reference genome, in FASTA format
- `-s`: the SNP database file (VCF or BCF)
- `-o`: the output recalibration table file
- the final argument is the input alignment file

Firepony will load an index for the reference file if it exists, which enables on-demand loading of reference sequences as the SNP database is loaded.

+
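If the FASTA index does not exist yet, it can be created beforehand with samtools (assuming samtools is installed; this step is not part of Firepony itself), for example:

samtools faidx /store/ref/hs37d5.fa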

For example, the following GATK command line:

+
java -Xmx8g GenomeAnalysisTK-3.4.jar \
+    -T BaseRecalibrator \
+    -I NA12878D_HiSeqX_R1.deduplicated.bam \
+    -R /store/ref/hs37d5.fa \
+    -knownSites /store/dbsnp/dbsnp_138.b37.vcf \
+    -o recal_data.table
+

would be replaced by the following Firepony command line:

+
firepony \
+    -r /store/ref/hs37d5.fa -s /store/dbsnp/dbsnp_138.b37.vcf \
+    -o recal_data.table NA12878D_HiSeqX_R1.deduplicated.bam
+

Additional command line options are described in the help output for firepony invoked by

+
`firepony --help`
+

Note that it is recommended to use the BCF format rather than VCF for SNP databases when running Firepony. Both generate the same results, but loading BCF files is much more efficient.

+
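As an illustration, a VCF SNP database can be converted to BCF with bcftools (assuming bcftools is installed; it is not bundled with Firepony):

bcftools view -O b -o /store/dbsnp/dbsnp_138.b37.bcf /store/dbsnp/dbsnp_138.b37.vcf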

At the moment, Firepony only supports recalibrating Illumina reads with the default GATK BQSR parameters, listed below in BQSR table format. Expanding the parameter set as well as the number of supported instruments will be done based on user feedback.

+
#:GATKTable:Arguments:Recalibration argument collection values used in this run
+Argument                    Value
+binary_tag_name             null
+covariate                   ReadGroupCovariate,QualityScoreCovariate,ContextCovariate,CycleCovariate
+default_platform            null
+deletions_default_quality   45
+force_platform              null
+indels_context_size         3
+insertions_default_quality  45
+low_quality_tail            2
+maximum_cycle_value         500
+mismatches_context_size     2
+mismatches_default_quality  -1
+no_standard_covs            false
+quantizing_levels           16
+recalibration_report        null
+run_without_dbsnp           false
+solid_nocall_strategy       THROW_EXCEPTION
+solid_recal_mode            SET_Q_ZERO
\ No newline at end of file diff --git a/doc_archive/deprecated/Merging_batched_call_sets_-_RETIRED.md b/doc_archive/deprecated/Merging_batched_call_sets_-_RETIRED.md new file mode 100644 index 000000000..d44e9ed9e --- /dev/null +++ b/doc_archive/deprecated/Merging_batched_call_sets_-_RETIRED.md @@ -0,0 +1,117 @@ +## Merging batched call sets - RETIRED + +http://gatkforums.broadinstitute.org/gatk/discussion/46/merging-batched-call-sets-retired + +

This procedure is deprecated since it is no longer necessary and goes against our Best Practices recommendations. For calling variants on multiple samples, use the Best Practices workflow for performing variant discovery using HaplotypeCaller.

+
+

Introduction

+

Three-stage procedure:

1. Create a master set of sites (SNPs and indels) that you want to genotype in all samples
2. Genotype each sample independently at the master sites
3. (Optional) Merge the per-sample VCFs back into a single call set

Creating the master set of sites: SNPs and Indels

+

The first step of batch merging is to create a master set of sites that you want to genotype in all samples. To make this problem concrete, suppose I have two VCF files:

+

Batch 1:

+
##fileformat=VCFv4.0
+#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA12891 
+20      9999996     .       A       ATC     .       PASS    .       GT:GQ   0/1:30
+20      10000000        .       T       G       .       PASS    .       GT:GQ   0/1:30
+20      10000117        .       C       T       .       FAIL    .       GT:GQ   0/1:30
+20      10000211        .       C       T       .       PASS    .       GT:GQ   0/1:30
+20      10001436        .       A       AGG     .       PASS    .       GT:GQ   1/1:30
+

Batch 2:

+
##fileformat=VCFv4.0
+#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA12878
+20      9999996     .       A       ATC     .       PASS    .       GT:GQ   0/1:30
+20      10000117        .       C       T       .       FAIL    .       GT:GQ   0/1:30
+20      10000211        .       C       T       .       FAIL    .       GT:GQ   0/1:30
+20      10000598        .       T       A       .       PASS    .       GT:GQ   1/1:30
+20      10001436        .       A       AGGCT   .       PASS    .       GT:GQ   1/1:30
+

In order to merge these batches, I need to make a variety of bookkeeping and filtering decisions, as outlined in the merged VCF below:

+

Master VCF:

+
20      9999996     .       A       ATC     .       PASS    .       GT:GQ   0/1:30  [pass in both]
+20      10000000        .       T       G       .       PASS    .       GT:GQ   0/1:30  [only in batch 1]
+20      10000117        .       C       T       .       FAIL    .       GT:GQ   0/1:30  [fail in both]
+20      10000211        .       C       T       .       FAIL    .       GT:GQ   0/1:30  [pass in 1, fail in 2, choice in unclear]
+20      10000598        .       T       A       .       PASS    .       GT:GQ   1/1:30  [only in batch 2]
+20      10001436        .       A       AGGCT   .       PASS    .       GT:GQ   1/1:30  [A/AGG in batch 1, A/AGGCT in batch 2, including this site may be problematic]
+

These issues fall into the following categories:

+ +

There are two difficult situations that must be addressed by the needs of the project merging batches:

- A site that passes filters in one batch but fails in another (such as 10000211 above), where it is unclear whether the site should be included
- A site where the batches disagree on the alternate allele (such as 10001436 above, A/AGG in batch 1 versus A/AGGCT in batch 2)

Unfortunately, we cannot determine which is actually the correct choice, especially given the goals of the project. We leave it up to the project bioinformatician to handle these cases when creating the master VCF. We are hopeful that at some point in the future we'll have a consensus approach to handle such merging, but until then this will be a manual process.

+

The GATK tool CombineVariants can be used to merge multiple VCF files, and parameter choices will allow you to handle some of the above issues. With tools like SelectVariants one can slice-and-dice the merged VCFs to handle these complexities as appropriate for your project's needs. For example, the above master merge can be produced with the following CombineVariants:

+
java -jar dist/GenomeAnalysisTK.jar \
+-T CombineVariants \
+-R human_g1k_v37.fasta \
+-V:one,VCF combine.1.vcf -V:two,VCF combine.2.vcf \
+--sites_only \
+-minimalVCF \
+-o master.vcf
+

producing the following VCF:

+
##fileformat=VCFv4.0
+#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
+20      9999996     .       A       ACT         .       PASS    set=Intersection
+20      10000000        .       T       G           .   PASS    set=one
+20      10000117        .       C       T           .       FAIL    set=FilteredInAll
+20      10000211        .       C       T           .       PASS    set=filterIntwo-one
+20      10000598        .       T       A           .       PASS    set=two
+20      10001436        .       A       AGG,AGGCT       .       PASS    set=Intersection
+

Genotyping your samples at these sites

+

Having created the master set of sites to genotype, along with their alleles, as in the previous section, you now use the UnifiedGenotyper to genotype each sample independently at the master set of sites. This GENOTYPE_GIVEN_ALLELES mode of the UnifiedGenotyper will jump into the sample BAM file, and calculate the genotype and genotype likelihoods of the sample at the site for each of the genotypes available for the REF and ALT alleles. For example, for site 10000211, the UnifiedGenotyper would evaluate the likelihoods of the CC, CT, and TT genotypes for the sample at this site, choose the most likely configuration, and generate a VCF record containing the genotype call and the likelihoods for the three genotype configurations.

+

As a concrete example command line, you can genotype the master.vcf file using the NA12878 sample from the resource bundle with the following command:

+
java -Xmx2g -jar dist/GenomeAnalysisTK.jar \
+-T UnifiedGenotyper \
+-R bundle/b37/human_g1k_v37.fasta \
+-I bundle/b37/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam \
+-alleles master.vcf \
+-L master.vcf \
+-gt_mode GENOTYPE_GIVEN_ALLELES \
+-out_mode EMIT_ALL_SITES \
+-stand_call_conf 0.0 \
+-glm BOTH \
+-G none \
+

The -L master.vcf argument tells the UG to only genotype the sites in the master file. If you don't specify this, the UG will genotype the master sites in GGA mode, but it will also genotype all other sites in the genome in regular mode.

+

The last item, -G none, prevents the UG from computing annotations you don't need. This command produces something like the following output:

+
##fileformat=VCFv4.0
+#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA12878
+20      9999996     .       A       ACT         4576.19 .       .   GT:DP:GQ:PL     1/1:76:99:4576,229,0
+20      10000000        .       T       G           0       .       .       GT:DP:GQ:PL     0/0:79:99:0,238,3093
+20      10000211        .       C       T       857.79  .       .   GT:AD:DP:GQ:PL  0/1:28,27:55:99:888,0,870
+20      10000598        .       T       A           1800.57 .       .   GT:AD:DP:GQ:PL  1/1:0,48:48:99:1834,144,0
+20      10001436        .       A       AGG,AGGCT       1921.12 .       .   GT:DP:GQ:PL     0/2:49:84.06:1960,2065,0,2695,222,84
+

Several things should be noted here:

+ +

This genotyping command can be performed independently per sample, and so can be parallelized easily on a farm with one job per sample, as in the following:

+
foreach sample in samples:
+  run UnifiedGenotyper command above with -I $sample.bam -o $sample.vcf
+end
+

(Optional) Merging the sample VCFs together

+

You can use a similar command for CombineVariants above to merge back together all of your single sample genotyping runs. Suppose all of my UnifiedGenotyper jobs have completed, and I have VCF files named sample1.vcf, sample2.vcf, to sampleN.vcf. The single command:

+
java -jar dist/GenomeAnalysisTK.jar -T CombineVariants -R human_g1k_v37.fasta -V:sample1 sample1.vcf -V:sample2 sample2.vcf [repeat until] -V:sampleN sampleN.vcf -o combined.vcf
+

General notes

+ \ No newline at end of file diff --git a/doc_archive/deprecated/Moved:_(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md b/doc_archive/deprecated/Moved:_(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md new file mode 100644 index 000000000..6f634a9a8 --- /dev/null +++ b/doc_archive/deprecated/Moved:_(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md @@ -0,0 +1,5 @@ +## Moved: (How to) Create a snippet of reads corresponding to a genomic interval + +http://gatkforums.broadinstitute.org/gatk/discussion/6530/moved-how-to-create-a-snippet-of-reads-corresponding-to-a-genomic-interval + +This discussion has been moved. \ No newline at end of file diff --git a/doc_archive/deprecated/Moved:_(How_to)_Efficiently_map_and_clean_up_short_read_sequence_data.md b/doc_archive/deprecated/Moved:_(How_to)_Efficiently_map_and_clean_up_short_read_sequence_data.md new file mode 100644 index 000000000..f38c5cab8 --- /dev/null +++ b/doc_archive/deprecated/Moved:_(How_to)_Efficiently_map_and_clean_up_short_read_sequence_data.md @@ -0,0 +1,5 @@ +## Moved: (How to) Efficiently map and clean up short read sequence data + +http://gatkforums.broadinstitute.org/gatk/discussion/6573/moved-how-to-efficiently-map-and-clean-up-short-read-sequence-data + +This discussion has been moved. \ No newline at end of file diff --git a/doc_archive/deprecated/Moved:_(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md b/doc_archive/deprecated/Moved:_(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md new file mode 100644 index 000000000..40d3ff8f8 --- /dev/null +++ b/doc_archive/deprecated/Moved:_(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md @@ -0,0 +1,5 @@ +## Moved: (How to) Generate an unmapped BAM from FASTQ or aligned BAM + +http://gatkforums.broadinstitute.org/gatk/discussion/6538/moved-how-to-generate-an-unmapped-bam-from-fastq-or-aligned-bam + +This discussion has been moved. \ No newline at end of file diff --git a/doc_archive/deprecated/Moved:_(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md b/doc_archive/deprecated/Moved:_(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md new file mode 100644 index 000000000..edb2daa92 --- /dev/null +++ b/doc_archive/deprecated/Moved:_(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md @@ -0,0 +1,5 @@ +## Moved: (How to) Mark duplicates with MarkDuplicates or MarkDuplicatesWithMateCigar + +http://gatkforums.broadinstitute.org/gatk/discussion/6873/moved-how-to-mark-duplicates-with-markduplicates-or-markduplicateswithmatecigar + +This discussion has been moved. \ No newline at end of file diff --git a/doc_archive/deprecated/Moved:_(How_to)_Visualize_an_alignment_with_IGV.md b/doc_archive/deprecated/Moved:_(How_to)_Visualize_an_alignment_with_IGV.md new file mode 100644 index 000000000..8c41832bb --- /dev/null +++ b/doc_archive/deprecated/Moved:_(How_to)_Visualize_an_alignment_with_IGV.md @@ -0,0 +1,5 @@ +## Moved: (How to) Visualize an alignment with IGV + +http://gatkforums.broadinstitute.org/gatk/discussion/6606/moved-how-to-visualize-an-alignment-with-igv + +This discussion has been moved. 
\ No newline at end of file diff --git a/doc_archive/deprecated/Per-base_alignment_qualities_(BAQ)_in_the_GATK.md b/doc_archive/deprecated/Per-base_alignment_qualities_(BAQ)_in_the_GATK.md new file mode 100644 index 000000000..d53665248 --- /dev/null +++ b/doc_archive/deprecated/Per-base_alignment_qualities_(BAQ)_in_the_GATK.md @@ -0,0 +1,50 @@ +## Per-base alignment qualities (BAQ) in the GATK + +http://gatkforums.broadinstitute.org/gatk/discussion/1326/per-base-alignment-qualities-baq-in-the-gatk + +

This article is out of date and no longer applicable. BAQs are no longer used in GATK.

+
+

1. Introduction

+

The GATK provides an implementation of the Per-Base Alignment Qualities (BAQ) developed by Heng Li in late 2010. See this SamTools page for more details.

+
+

2. Using BAQ

+

The BAQ algorithm is applied by the GATK engine itself, which means that all GATK walkers can potentially benefit from it. By default, BAQ is OFF, meaning that the engine will not use BAQ quality scores at all.

+

The GATK engine accepts the argument -baq with the following enum values:

+
public enum CalculationMode {
+    OFF,                        // don't apply a BAQ at all, the default
+    CALCULATE_AS_NECESSARY,     // do HMM BAQ calculation on the fly, as necessary, if there's no tag
+    RECALCULATE                 // do HMM BAQ calculation on the fly, regardless of whether there's a tag present
+}
+

If you want to enable BAQ, the usual thing to do is CALCULATE_AS_NECESSARY, which will calculate BAQ values if they are not in the BQ read tag. If your reads are already tagged with BQ values, then the GATK will use those. RECALCULATE will always recalculate the BAQ, regardless of the tag, which is useful if you are experimenting with the gap open penalty (see below).

+

If you are really an expert, the GATK allows you to specify the BAQ gap open penalty (-baqGOP) to use in the HMM. This value is 40 by default, a good value for whole genomes and exomes for highly sensitive calls. However, if you are analyzing exome data only, you may want to use 30, which seems to result in a more specific call set. We are still experimenting with these values. Some walkers, where BAQ would corrupt their analyses, forbid the use of BAQ and will throw an exception if -baq is provided.
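As a purely illustrative example (file names are placeholders), an exome run of the UnifiedGenotyper with on-the-fly BAQ and a lower gap open penalty might look like this:

# illustrative command; file names are placeholders
java -jar GenomeAnalysisTK.jar \
+    -T UnifiedGenotyper \
+    -R human_g1k_v37.fasta \
+    -I exome.recalibrated.bam \
+    -baq CALCULATE_AS_NECESSARY \
+    -baqGOP 30 \
+    -o exome.calls.vcf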

+
+

3. Some example uses of the BAQ in the GATK

+ +

Note that some tools should not have BAQ applied to them.

+

This last option will be a particularly useful for people who are already doing base quality score recalibration. Suppose I have a pipeline that does:

+
RealignerTargetCreator
+IndelRealigner
+
+BaseRecalibrator
+PrintReads (with --BQSR input)
+
+UnifiedGenotyper
+

A highly efficient BAQ extended pipeline would look like

+
RealignerTargetCreator
+IndelRealigner // don't bother with BAQ here, since we will calculate it in table recalibrator
+
+BaseRecalibrator
+PrintReads (with --BQSR input) -baq RECALCULATE // now the reads will have a BAQ tag added.  Slows the tool down some
+
+UnifiedGenotyper -baq CALCULATE_AS_NECESSARY // UG will use the tags from TableRecalibrate, keeping UG fast
+
+

4. BAQ and walker control

+

Walkers can control how the BAQ calculation is applied via the @BAQMode annotation: it can be applied as a tag, by overwriting the quality scores, or by only returning the BAQ-capped quality scores. Additionally, walkers can be set up to have the BAQ applied to the incoming reads (ON_INPUT, the default), to output reads (ON_OUTPUT), or HANDLED_BY_WALKER, which means that calling into the BAQ system is the responsibility of the individual walker.

\ No newline at end of file diff --git a/doc_archive/deprecated/Statistical_methods_used_by_GATK_tools.md b/doc_archive/deprecated/Statistical_methods_used_by_GATK_tools.md new file mode 100644 index 000000000..4189000e9 --- /dev/null +++ b/doc_archive/deprecated/Statistical_methods_used_by_GATK_tools.md @@ -0,0 +1,90 @@ +## Statistical methods used by GATK tools + +http://gatkforums.broadinstitute.org/gatk/discussion/4732/statistical-methods-used-by-gatk-tools + +

This document is out of date; see individual method documents in the Methods and Algorithms section.

+

List of documented methods below

- Inbreeding Coefficient
- Rank Sum Test
+

Inbreeding Coefficient

+

Overview

+

Although the name Inbreeding Coefficient suggests it is a measure of inbreeding, Inbreeding Coefficient measures the excess heterozygosity at a variant site. It can be used as a proxy for poor mapping (sites that have high Inbreeding Coefficients are typically locations in the genome where the mapping is bad and reads that are in the region mismatch the region because they belong elsewhere). At least 10 samples are required (preferably many more) in order for this annotation to be calculated properly.

+

Theory

+

The Wikipedia article about Hardy-Weinberg principle includes some very helpful information on the theoretical underpinnings of the test, as Inbreeding Coefficient relies on the math behind the Hardy-Weinberg Principle.

+

Use in GATK

+

We calculate Inbreeding Coefficient as 1-(# observed heterozygotes)/(# expected heterozygotes). The number of observed heterozygotes can be calculated from the data. The number of expected heterozygotes is 2pq, where p is the frequency of the reference allele and q is the frequency of the alternate allele (AF). (Please see Hardy-Weinberg Principle link above). A value of 0 suggests the site is in Hardy-Weinberg Equilibrium. Negative values of Inbreeding Coefficient could mean there are too many heterozygotes and suggest a site with bad mapping. The other nice side effect is that one of the error modes in variant calling is for all calls to be heterozygous, which this metric captures nicely. This is why we recommend filtering out variants with negative Inbreeding Coefficients. Although positive values suggest too few heterozygotes, we do not recommend filtering out positive values because they could arise from admixture of different ethnic populations.

+
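Written as a formula, using the quantities defined above (with N the total number of genotyped samples):

$$ \text{InbreedingCoeff} = 1 - \frac{\text{observed hets}}{\text{expected hets}} = 1 - \frac{\text{observed hets}}{2pq \cdot N} $$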

Please note: Inbreeding Coefficient is not really robust to the assumption of being unrelated. We have found that relatedness does break down the assumptions Inbreeding Coefficient is based on. For family samples, it really depends on how many families and samples you have. For example, if you have 3 families, inbreeding coefficient is not going to work. But, if you have 10,000 samples and just a few families, it should be fine. Also, if you pass in a pedigree file (*.ped), it will use that information to calculate Inbreeding Coefficient only using the founders (i.e. individuals whose parents aren't in the callset), and as long as there are >= 10 of those, the data should be pretty good.

+

Example: Inbreeding Coefficient

+

In this example, let's say we are working with 100 human samples, and we are trying to calculate the Inbreeding Coefficient at a site that has A for the reference allele and T for the alternate allele.

+

Step 1: Count the number of samples that have each genotype (hom-ref, het, hom-var)

+

A/A (hom-ref): 51
+A/T (het): 11
+T/T (hom-var): 38

+

Step 2: Get all necessary information to solve equation

+

We need to find the # observed hets and # expected hets.

+

number of observed hets = 11 (from number of observed A/T given above)

+

number of expected hets = 2pq * total genotypes (2pq is frequency of heterozygotes according to Hardy-Weinberg Equilibrium. We need to multiply that frequency by the number of all genotypes in the population to get the expected number of heterozygotes.)

+

p = frequency of ref allele = (# ref alleles)/(total # alleles) = (2 × 51 + 11)/(2 × 51 + 11 × 2 + 38 × 2) = 113/200 = 0.565
+q = frequency of alt allele = (# alt alleles)/(total # alleles) = (2 × 38 + 11)/(2 × 51 + 11 × 2 + 38 × 2) = 87/200 = 0.435

+

Remember that homozygous genotypes have two copies of the allele of interest (because we're assuming diploid.)

+

number of expected hets = 2pq × 100 = 2 × 0.565 × 0.435 × 100 = 49.155

+

Step 3: Plug in the Numbers

+

Inbreeding Coefficient = 1 - (# observed hets)/(#expected hets) = 1 - (11/49.155) = 0.776

+

Step 4: Interpret the output

+

Our Inbreeding Coefficient is 0.776. Because it is a positive number, we can see there are fewer than the expected number of heterozygotes according to the Hardy-Weinberg Principle. Too few heterozygotes can imply inbreeding. However, we do not recommend filtering this site out because there may be a mixture of ethnicities in the cohort, and some ethnicities may be hom-ref while others are hom-var.

+

Rank Sum Test

+

Overview

+

The Rank Sum Test, also known as Mann-Whitney-Wilcoxon U-test after its developers (who are variously credited in subsets and in different orders depending on the sources you read) is a statistical test that aims to determine whether there is significant difference in the values of two populations of data.

+

Theory

+

The Wikipedia article about the Rank Sum Test includes some very helpful information on the theoretical underpinnings of the test, as well as various examples of how it can be applied.

+

Use in GATK

+

This test is used by several GATK annotations, including two standard annotations that are used for variant recalibration in the Best Practices: MappingQualityRankSum and ReadPosRankSum. In all cases, the idea is to check, for a given candidate variant, whether the properties of the data that support the reference allele are similar to those of the data that support a variant allele. If they are not similar, we conclude that there may be some technical bias and that the candidate variant may be an artifact.

+

Example: BaseQualityRankSumTest

+

Note: this example applies Method 2 from the Wikipedia article linked above.

+

In this example, we have a set of 20 reads, 10 of which support the reference allele and 10 of which support the alternate allele. At first glance, that looks like a clear heterozygous 0/1 site. But to be thorough in our analysis and to account for any technical bias, we want to determine if there is a significant difference in the base qualities of the bases that support the reference allele vs. the bases that support the alternate allele.

+

Before we proceed, we must define our null hypothesis and alternate hypothesis.

+

-Null hypothesis: There is no difference in the base qualities that support the reference allele and the base qualities that support the alternate allele.

+

-Alternate hypothesis: There is a difference in the base qualities that support the reference allele and the base qualities that support the alternate allele.

+

Step 1: List the relevant observations

+

Reference allele base qualities: 20, 25, 26, 30, 32, 40, 47, 50, 53, 60
+Alternate allele base qualities: 0, 7, 10, 17, 20, 21, 30, 34, 40, 45

+

Step 2: Rank the observations

+

First, we arrange all the observations (base qualities) into a list of values ordered from lowest to highest (reference bases are in bold).

+

0, 7, 10, 17, 20, 20, 21, 25, 26, 30, 30, 32, 34, 40, 40, 45, 47, 50, 53, 60

+

Next we determine the ranks of the values. Since there are 20 observations (the base qualities), we have 20 ranks to assign. Whenever there are ties between observations for the rank, we take the rank to be equal to the midpoint of the ranks. For example, for 20(ref) and 20(alt), we have a tie in values, so we assign each observation a rank of (5+6)/2 = 5.5.

+

The ranks from the above list are (reference ranks are in bold):

+

1, 2, 3, 4, 5.5, 5.5, 7, 8, 9, 10.5, 10.5, 12, 13, 14.5, 14.5, 16, 17, 18, 19, 20

+

Step 3: Add up the ranks for each group

+

We now need to add up the ranks for the base qualities that came from the reference allele and the alternate allele.

+

$$ Rank_{ref} = 133.5 $$

+

$$ Rank_{alt} = 76.5 $$

+

Step 4: Calculate U for each group

+

U is a statistic that tells us the difference between the two rank totals. We can use the U statistic to calculate the z-score (explained below), which will give us our p-value.

+

Calculate U for each group (n = number of observations in each sample)

+

$$ U_{ref} = n_{ref} \times n_{alt} + \frac{ n_{ref} (n_{ref} + 1) }{ 2 } - Rank_{ref} $$

+

$$ U_{alt} = n_{alt} \times n_{ref} + \frac{ n_{alt} (n_{alt} + 1) }{ 2 } - Rank_{alt} $$

+

$$ U_{ref} = 10 \times 10 + \frac{ 10 \times 11 }{ 2 } - 133.5 = 21.5 $$

+

$$ U_{alt} = 10 \times 10 + \frac{ 10 \times 11 }{ 2 } - 76.5 = 78.5 $$

+

Step 5: Calculate the overall z-score

+

Next, we need to calculate the z-score which will allow us to get the p-value. The z-score is a normalized score that allows us to compare the probability of the U score occurring in our distribution.
+https://statistics.laerd.com/statistical-guides/standard-score.php

+

The equation to get the z-score is:

+

$$ z = \frac{ U - \mu_U }{ \sigma_U } $$

+

Breaking this equation down:

+

$$ z = \text{the z-score} $$

+

$$ U = \text{lowest of the U scores calculated in previous steps} $$

+

$$ \mu_U = \text{mean of the U scores above} = \frac{ n_{ref} \times n_{alt} }{ 2 } $$

+

$$ \sigma_U = \text{standard deviation of U} = \sqrt{ \frac{ n_{ref} \times n_{alt} \times (n_{ref} + n_{alt} + 1) }{ 12 } } $$

+

To calculate our z:

+

$$ U = 21.5 $$

+

$$ \mu_U = \frac{ 10 \times 10 }{ 2 } = 50 $$

+

$$ \sigma_U = \sqrt{ \frac{ 10 \times 10 \times (10 + 10 + 1) }{ 12 } } = 13.229 $$

+

So altogether we have:

+

$$ z = \frac{ 21.5 - 50 }{ 13.229 } = -2.154 $$

+

Step 6: Calculate and interpret the p-value

+

The p-value is the probability of obtaining a z-score at least as extreme as the one we got, assuming the null hypothesis is true. In our example, the p-value gives us the probability that there is no difference in the base qualities that support the reference allele and the base qualities that support the alternate allele. The lower the p-value, the less likely it is that there is no difference in the base qualities.

+

Going to the z-score table, or just using a p-value calculator, we find the p-value to be 0.0312.

+

This means that, if there were truly no difference between the two groups, we would expect to see a difference in base quality scores at least this extreme only about 3% of the time. Using a p-value cutoff of 0.05, we have enough evidence to reject our null hypothesis that there is no difference in the base qualities of the reference and alternate alleles. This indicates there is some bias and that the alternate allele is less well supported by the data than the allele counts suggest.

\ No newline at end of file diff --git a/doc_archive/deprecated/Using_Variant_Annotator.md b/doc_archive/deprecated/Using_Variant_Annotator.md new file mode 100644 index 000000000..141d969b4 --- /dev/null +++ b/doc_archive/deprecated/Using_Variant_Annotator.md @@ -0,0 +1,30 @@ +## Using Variant Annotator + +http://gatkforums.broadinstitute.org/gatk/discussion/49/using-variant-annotator + +

This document is out of date and has been retired. Please see the Annotation documentation in the Tool Docs as well as various other Guide articles for better materials on annotating variants.

+
+

2 SNPs with significant strand bias

+ +

Several SNPs with excessive coverage

+ +

For a complete, detailed argument reference, refer to the GATK document page here.

+

Introduction

+

In addition to true variation, variant callers emit a number of false-positives. Some of these false-positives can be detected and rejected by various statistical tests. VariantAnnotator provides a way of annotating variant calls as preparation for executing these tests.

+

Description of the haplotype score annotation

+ +

Examples of Available Annotations

+

The list below is not comprehensive. Please use the --list argument to get a list of all possible annotations available. Also, see the FAQ article on understanding the Unified Genotyper's VCF files for a description of some of the more standard annotations.

+ +

Note that technically the VariantAnnotator does not require reads (from a BAM file) to run; if no reads are provided, only those Annotations which don't use reads (e.g. Chromosome Counts) will be added. But most Annotations do require reads. When running the tool we recommend that you add the -L argument with the variant rod to your command line for efficiency and speed.
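For illustration only (the file names and the choice of annotation are placeholders, not taken from the original article), a typical invocation that adds an annotation to an existing callset looks like this:

# illustrative command; file names and annotation are placeholders
java -jar GenomeAnalysisTK.jar \
+    -T VariantAnnotator \
+    -R human_g1k_v37.fasta \
+    -I sample.bam \
+    -V calls.vcf \
+    -L calls.vcf \
+    -A Coverage \
+    -o annotated.vcf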

\ No newline at end of file diff --git a/doc_archive/deprecated/Walkthrough_of_the_Oct_2013_GATK_workshop_hands-on_session.md b/doc_archive/deprecated/Walkthrough_of_the_Oct_2013_GATK_workshop_hands-on_session.md new file mode 100644 index 000000000..ad0c319b5 --- /dev/null +++ b/doc_archive/deprecated/Walkthrough_of_the_Oct_2013_GATK_workshop_hands-on_session.md @@ -0,0 +1,54 @@ +## Walkthrough of the Oct 2013 GATK workshop hands-on session + +http://gatkforums.broadinstitute.org/gatk/discussion/3366/walkthrough-of-the-oct-2013-gatk-workshop-hands-on-session + +

Note: the exact data files we used in this tutorial are no longer available. However, you can use the files in the resource bundle to work through this tutorial. You may need to adapt the filenames accordingly.

+
+

Map and mark duplicates

+

http://gatkforums.broadinstitute.org/discussion/2799/howto-map-and-mark-duplicates

+

Starting with aligned (mapped) and deduplicated (dedupped) reads in a BAM file to save time.

+

- Generate index

+

Create an index file to enable fast seeking through the file.

+
java -jar BuildBamIndex.jar I=dedupped_20.bam
+

- Prepare reference to work with GATK

+

http://gatkforums.broadinstitute.org/discussion/2798/howto-prepare-a-reference-for-use-with-bwa-and-gatk

+

Create a dictionary file and index for the reference.

+
java -jar CreateSequenceDictionary.jar R=human_b37_20.fasta O=human_b37_20.dict
+
+samtools faidx human_b37_20.fasta 
+
+

Getting to know GATK

+

- Run a simple walker: CountReads

+

Identify basic syntax, console output: version, command recap line, progress estimates, result if applicable.

+
java -jar GenomeAnalysisTK.jar -T CountReads -R human_b37_20.fasta -I dedupped_20.bam -L 20
+

- Add a filter to count how many duplicates were marked

+

Look at filtering summary.

+
java -jar GenomeAnalysisTK.jar -T CountReads -R human_b37_20.fasta -I dedupped_20.bam -L 20 -rf DuplicateRead
+

- Demonstrate how to select a subset of read data

+

This can come in handy for bug reports.

+
java -jar GenomeAnalysisTK.jar -T PrintReads -R human_b37_20.fasta -I dedupped_20.bam -L 20:10000000-11000000 -o snippet.bam
+

- Demonstrate the equivalent for variant calls

+

Refer to docs for many other capabilities including selecting by sample name, up to complex queries.

+
java -jar GenomeAnalysisTK.jar -T SelectVariants -R human_b37_20.fasta -V dbsnp_b37_20.vcf -o snippet.vcf -L 20:10000000-11000000
+
+

Back to data processing

+

- Realign around Indels

+

http://gatkforums.broadinstitute.org/discussion/2800/howto-perform-local-realignment-around-indels

+
java -jar GenomeAnalysisTK.jar -T RealignerTargetCreator -R human_b37_20.fasta -I dedupped_20.bam -known indels_b37_20.vcf -o target_intervals.list -L 20 
+
+java -jar GenomeAnalysisTK.jar -T IndelRealigner -R human_b37_20.fasta -I dedupped_20.bam -known indels_b37_20.vcf -targetIntervals target_intervals.list -o realigned_20.bam -L 20 
+

- Base recalibration

+

http://gatkforums.broadinstitute.org/discussion/2801/howto-recalibrate-base-quality-scores-run-bqsr

+
java -jar GenomeAnalysisTK.jar -T BaseRecalibrator -R human_b37_20.fasta -I realigned_20.bam -knownSites dbsnp_b37_20.vcf -knownSites indels_b37_20.vcf -o recal_20.table -L 20
+
+java -jar GenomeAnalysisTK.jar -T PrintReads -R human_b37_20.fasta -I realigned_20.bam -BQSR recal_20.table -o recalibrated_20.bam -L 20
+
+java -jar GenomeAnalysisTK.jar -T BaseRecalibrator -R human_b37_20.fasta -I recalibrated_20.bam -knownSites dbsnp_b37_20.vcf -knownSites indels_b37_20.vcf -o post_recal_20.table -L 20
+
+java -jar GenomeAnalysisTK.jar -T AnalyzeCovariates -R human_b37_20.fasta -before recal_20.table -after post_recal_20.table -plots recalibration_plots.pdf -L 20 
+

- ReduceReads

+

http://gatkforums.broadinstitute.org/discussion/2802/howto-compress-read-data-with-reducereads

+
java -jar GenomeAnalysisTK.jar -T ReduceReads -R human_b37_20.fasta -I recalibrated_20.bam -o reduced_20.bam -L 20 
+

- HaplotypeCaller

+

http://gatkforums.broadinstitute.org/discussion/2803/howto-call-variants-on-a-diploid-genome-with-the-haplotypecaller

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R human_b37_20.fasta -I reduced_20.bam --genotyping_mode DISCOVERY -stand_emit_conf 10 -stand_call_conf 30 -o variants_20.vcf -L 20 
\ No newline at end of file diff --git a/doc_archive/deprecated/What_is_Firepony_and_what_can_I_expect_from_it?.md b/doc_archive/deprecated/What_is_Firepony_and_what_can_I_expect_from_it?.md new file mode 100644 index 000000000..0d9596f9b --- /dev/null +++ b/doc_archive/deprecated/What_is_Firepony_and_what_can_I_expect_from_it?.md @@ -0,0 +1,16 @@ +## What is Firepony and what can I expect from it? + +http://gatkforums.broadinstitute.org/gatk/discussion/6019/what-is-firepony-and-what-can-i-expect-from-it + +

Firepony in a nutshell

+

Firepony is a base quality score recalibrator for aligned read data sets. It recalculates the quality scores for each nucleotide in a SAM/BAM file based on the original quality data generated by the sequencer plus the empirical data obtained by running alignment.

+

The algorithm is a re-engineering of the base quality score recalibrator in the Genome Analysis Toolkit. It generates identical results, but runs much faster.

+

Note that this tool was written by external collaborators of the GATK team and is their sole responsibility. To be clear, Firepony is not part of the official GATK software and is not tested/validated by the GATK developers. Use at your own risk.

+
+

How Firepony fits into your existing processing pipeline (workflow and command line usage)

+

Firepony is meant to be a drop-in replacement for the BQSR step in GATK. The output of Firepony is a table that can be used as input for the PrintReads tool in GATK.

+

Existing pipelines can be modified by replacing the BQSR step (i.e., running GATK with the -T BaseRecalibrator argument) with Firepony, as outlined in the accompanying documentation.
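For example (a sketch only, reusing the file names from the usage document above), the Firepony output table can be passed to PrintReads just like a GATK-generated recalibration table:

# illustrative command; file names reuse the earlier Firepony example
java -jar GenomeAnalysisTK.jar \
+    -T PrintReads \
+    -R /store/ref/hs37d5.fa \
+    -I NA12878D_HiSeqX_R1.deduplicated.bam \
+    -BQSR recal_data.table \
+    -o NA12878D_HiSeqX_R1.recalibrated.bam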

+
+

Technical requirements and expected performance

+

Firepony runs on Linux systems based on Intel CPUs with 64-bit support and at least 16GB of RAM. It can optionally make use of NVIDIA GPUs (Kepler class or higher with at least 4GB of memory) for higher performance.

+

Compared to GATK, Firepony runs anywhere from 5x to 12x faster, depending on the specific hardware and data set used. The output of Firepony is compatible with GATK, meaning it can be used by subsequent processing steps that rely on GATK.

\ No newline at end of file diff --git a/doc_archive/deprecated/Where_can_I_get_more_information_about_high-throughput_sequencing_concepts_and_terms?.md b/doc_archive/deprecated/Where_can_I_get_more_information_about_high-throughput_sequencing_concepts_and_terms?.md new file mode 100644 index 000000000..a9d4d9568 --- /dev/null +++ b/doc_archive/deprecated/Where_can_I_get_more_information_about_high-throughput_sequencing_concepts_and_terms?.md @@ -0,0 +1,25 @@ +## Where can I get more information about high-throughput sequencing concepts and terms? + +http://gatkforums.broadinstitute.org/gatk/discussion/1321/where-can-i-get-more-information-about-high-throughput-sequencing-concepts-and-terms + +

This article has been retired, as the resources it cites are somewhat out of date. For an introduction to GATK and sequence analysis, see the Best Practices section of the website, which contains a lot of intro-level information and references useful resources.

+

We know this field can be confusing or even overwhelming to newcomers, and getting to grips with a large and varied toolkit like the GATK can be a big challenge. We have produced a presentation that we hope will help you review all the background information that you need to know in order to use the GATK:

+ +

In addition, the following links feature a lot of useful educational material about concepts and terminology related to next-generation sequencing:

+ \ No newline at end of file diff --git a/doc_archive/deprecated/Workshop_walkthrough_(Brussels_2014).md b/doc_archive/deprecated/Workshop_walkthrough_(Brussels_2014).md new file mode 100644 index 000000000..ede52df1d --- /dev/null +++ b/doc_archive/deprecated/Workshop_walkthrough_(Brussels_2014).md @@ -0,0 +1,79 @@ +## Workshop walkthrough (Brussels 2014) + +http://gatkforums.broadinstitute.org/gatk/discussion/4327/workshop-walkthrough-brussels-2014 + +

Note: this is a walkthrough of a hands-on GATK tutorial given at the Royal Institute of Natural Sciences on June 26, 2014 in Brussels, Belgium. It is intended to be performed with version 3.1-2 of the GATK and the corresponding data bundle.

+

Data files

+

We start with a BAM file called "NA12878.wgs.1lib.bam" (along with its index, "NA12878.wgs.1lib.bai") containing Illumina sequence reads from our favorite test subject, NA12878, that have been mapped using BWA-mem and processed using Picard tools according to the instructions here:

+

http://www.broadinstitute.org/gatk/guide/article?id=2799

+

Note that this file only contains sequence for a small region of chromosome 20, in order to minimize the file size and speed up the processing steps, for demonstration purposes. Normally you would run the steps in this tutorial on the entire genome (or exome).

+

This subsetted file was prepared by extracting read group 20GAV.1 from the CEUTrio.HiSeq.WGS.b37.NA12878.bam that is available in our resource bundle, using the following command:

+
java -jar GenomeAnalysisTK.jar -T PrintReads -R human_g1k_v37.fasta -I CEUTrio.HiSeq.WGS.b37.NA12878.bam -o NA12878.wgs.1lib.bam -L 20 -rf SingleReadGroup -goodRG 20GAV.1
+

(We'll explain later in the tutorial how to use this kind of utility function to manipulate BAM files.)

+

We also have our human genome reference, called "human_g1k_v37.fasta", which has been prepared according to the instructions here:

+

http://www.broadinstitute.org/gatk/guide/article?id=2798

+

We will walk through both of these tutorials to explain the processing, but without actually running the steps to save time.

+

And finally we have a few resource files containing known variants (dbsnp, mills indels). These files are all available in the resource bundle on our FTP server. See here for access instructions:

+

http://www.broadinstitute.org/gatk/guide/article?id=1215

+
+

DAY 1

+

Prelude: BAM manipulation with Picard and Samtools

+

- Viewing BAM file information

+

See also the Samtools docs:

+

http://samtools.sourceforge.net/samtools.shtml
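
For example (a sketch using this tutorial's file; samtools must be on your path):

    samtools view -H NA12878.wgs.1lib.bam      # print only the header (@HD, @SQ, @RG, @PG lines)
    samtools view NA12878.wgs.1lib.bam | less  # page through the read records
    samtools flagstat NA12878.wgs.1lib.bam     # quick summary counts based on the flag field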

+

- Reverting a BAM file

+

Strip previous GATK processing from the BAM we are using with this Picard command:

+
java -jar RevertSam.jar I=NA12878.wgs.1lib.bam O=aligned_reads_20.bam RESTORE_ORIGINAL_QUALITIES=true REMOVE_DUPLICATE_INFORMATION=true REMOVE_ALIGNMENT_INFORMATION=false SORT_ORDER=coordinate
+

Note that it is possible to revert the file to FastQ format by setting REMOVE_ALIGNMENT_INFORMATION=true, but this method leads to biases in the alignment process, so if you want to do that, the better method is to follow the instructions given here:

+

http://www.broadinstitute.org/gatk/guide/article?id=2908

+

See also the Picard docs:

+

http://picard.sourceforge.net/command-line-overview.shtml

+

Mark Duplicates

+

See penultimate step of http://www.broadinstitute.org/gatk/guide/article?id=2799
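
The command takes the following form (a sketch, assuming the standalone Picard jars used elsewhere in this walkthrough; the metrics file name is arbitrary):

    java -jar MarkDuplicates.jar I=aligned_reads_20.bam O=dedupped_20.bam METRICS_FILE=dedup_metrics.txt CREATE_INDEX=true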

+

After a few minutes, the file (which we'll call "dedupped_20.bam") is ready for use with GATK.

+

Interlude: tour of the documentation, website, forum etc. Also show how to access the bundle on the FTP server with FileZilla.

+

Getting to know GATK

+

Before starting to run the GATK Best Practices, we are going to learn about the basic syntax of GATK, how the results are output, how to interpret error messages, and so on.

+

- Run a simple walker: CountReads

+

Identify basic syntax, console output: version, command recap line, progress estimates, result if applicable.

+
java -jar GenomeAnalysisTK.jar -T CountReads -R human_g1k_v37.fasta -I dedupped_20.bam -L 20
+

- Add a filter to count how many duplicates were marked

+

Look at the filtering summary.

+
java -jar GenomeAnalysisTK.jar -T CountReads -R human_g1k_v37.fasta -I dedupped_20.bam -L 20 -rf DuplicateRead
+

- Demonstrate how to select a subset of read data

+

This can come in handy for bug reports.

+
java -jar GenomeAnalysisTK.jar -T PrintReads -R human_g1k_v37.fasta -I dedupped_20.bam -L 20:10000000-11000000 -o snippet.bam
+

Also show how a bug report should be formatted and submitted. See http://www.broadinstitute.org/gatk/guide/article?id=1894

+

- Demonstrate the equivalent for variant calls

+

Refer to docs for many other capabilities including selecting by sample name, up to complex queries.

+
java -jar GenomeAnalysisTK.jar -T SelectVariants -R human_g1k_v37.fasta -V dbsnp_b37_20.vcf -o snippet.vcf -L 20:10000000-11000000
+

See http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_variantutils_SelectVariants.html

+
+

GATK Best Practices for data processing (DNA seq)

+

These steps should typically be performed per lane of data. Here we are running the tools on a small slice of the data, to save time and disk space, but normally you would run on the entire genome or exome. This is especially important for BQSR, which does not work well on small amounts of data.

+

Now let's pick up where we left off after Marking Duplicates.

+

- Realign around Indels

+

See http://gatkforums.broadinstitute.org/discussion/2800/howto-perform-local-realignment-around-indels

+
java -jar GenomeAnalysisTK.jar -T RealignerTargetCreator -R human_g1k_v37.fasta -I dedupped_20.bam -known Mills_and_1000G_gold_standard.indels.b37.vcf -o target_intervals.list -L 20:10000000-11000000 
+
+java -jar GenomeAnalysisTK.jar -T IndelRealigner -R human_g1k_v37.fasta -I dedupped_20.bam -known Mills_and_1000G_gold_standard.indels.b37.vcf -targetIntervals target_intervals.list -o realigned.bam -L 20:10000000-11000000 
+

- Base recalibration

+

See http://gatkforums.broadinstitute.org/discussion/2801/howto-recalibrate-base-quality-scores-run-bqsr

+
java -jar GenomeAnalysisTK.jar -T BaseRecalibrator -R human_g1k_v37.fasta -I realigned_20.bam -knownSites dbsnp_b37_20.vcf -knownSites Mills_and_1000G_gold_standard.indels.b37.vcf -o recal_20.table -L 20:10000000-11000000
+
+java -jar GenomeAnalysisTK.jar -T PrintReads -R human_g1k_v37.fasta -I realigned_20.bam -BQSR recal_20.table -o recal_20.bam -L 20:10000000-11000000
+
+java -jar GenomeAnalysisTK.jar -T BaseRecalibrator -R human_g1k_v37.fasta -I recal_20.bam -knownSites dbsnp_b37_20.vcf -knownSites Mills_and_1000G_gold_standard.indels.b37.vcf -o post_recal_20.table -L 20:10000000-11000000
+
+java -jar GenomeAnalysisTK.jar -T AnalyzeCovariates -R human_g1k_v37.fasta -before recal_20.table -after post_recal_20.table -plots recalibration_plots.pdf -L 20:10000000-11000000
+
+

GATK Best Practices for variant calling (DNA seq)

+

- Run HaplotypeCaller in regular mode

+

See http://www.broadinstitute.org/gatk/guide/article?id=2803

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R human_g1k_v37.fasta -I recal_20.bam -o raw_hc_20.vcf -L 20:10000000-11000000
+

Look at VCF in text and in IGV, compare with bam file.

+

- Run HaplotypeCaller in GVCF mode (banded and BP_RESOLUTION)

+

See http://www.broadinstitute.org/gatk/guide/article?id=3893

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R human_g1k_v37.fasta -I recal_20.bam -o raw_hc_20.g.vcf -L 20:10000000-11000000 --emitRefConfidence GVCF --variant_index_type LINEAR --variant_index_parameter 128000
+

Compare to regular VCF.

\ No newline at end of file diff --git a/doc_archive/deprecated/[How_to]_Generate_a_BAM_for_variant_discovery_(long).md b/doc_archive/deprecated/[How_to]_Generate_a_BAM_for_variant_discovery_(long).md new file mode 100644 index 000000000..859f92925 --- /dev/null +++ b/doc_archive/deprecated/[How_to]_Generate_a_BAM_for_variant_discovery_(long).md @@ -0,0 +1,476 @@ +## [How to] Generate a BAM for variant discovery (long) + +http://gatkforums.broadinstitute.org/gatk/discussion/5969/how-to-generate-a-bam-for-variant-discovery-long + +

This document is an archived rough draft of Tutorial#6483. Please use the public tutorial. If you are interested in aligning to GRCh38, then please refer to a separate tutorial, Tutorial#8017.

+
+

[work in progress--I am breaking this up into smaller chunks]

This document in part replaces the previous post (howto) Revert a BAM file to FastQ format that uses HTSlib commands. The workflow assumes familiarity with the concepts given in Collected FAQs about BAM files.

+ +

We outline steps to preprocess Illumina and similar-technology DNA sequence reads for use in GATK's variant discovery workflow. This preprocessing involves marking adapter sequences using MarkIlluminaAdapters so they contribute minimally to alignments, aligning with the BWA aligner's maximal exact match (MEM) algorithm, and preserving and adjusting read data and read metadata using MergeBamAlignment so that downstream results remain consistent and comparable with analyses from the Broad Institute. With the exception of BWA, we use the most current versions of tools as of this writing. The workflow results in an aligned BAM file with appropriate meta information that is ready for processing with MarkDuplicates.

+

This workflow applies to three common types of sequence read files: (A) aligned BAMs that need realignment, (B) FASTQ format data and (C) raw sequencing data in BAM format. If you have raw data in BAM format (C), given appropriate read group fields, you can start with step 2. The other two formats require conversion to unmapped BAM (uBAM). We use Picard's RevertSam to convert an aligned BAM (A) or Picard's FastqToSam to convert a FASTQ (B) to the uBAM.

+

We address options relevant to processing reads extracted from an interval as well as options for processing large files, in our case a ~150 GB file called Solexa-272222. For faster processing, the tutorial uses a smaller file of reads aligning to a genomic interval, called snippet, derived from Solexa-272222. The example commands apply to the larger file. Some comments on the workflow:

+ +
+

The steps of the workflow are as follows.

+
    +
1. Generate an unmapped BAM (uBAM)
    - (A) Convert the FASTQ to uBAM and add read group information using FastqToSam
    - (B1) [Optional] Extract reads in a genomic interval from aligned BAM
    - (B2) Convert aligned BAM to uBAM and discard problematic records using RevertSam
2. Mark adapter sequences using MarkIlluminaAdapters
3. Convert uBAM to FASTQ and assign adapter bases low qualities using SamToFastq
4. Align reads and flag secondary hits using BWA MEM
5. [Optional] Pipe steps 3 & 4 and collect alignment metrics
6. [Optional] Sort, index and convert alignment to a BAM using SortSam and visualize on IGV
7. Restore altered data and apply & adjust meta information using MergeBamAlignment
+
+

+

1. Generate an unmapped BAM (uBAM)

+

The goal is to produce an unmapped BAM file with appropriate read group (@RG) information that differentiates not only samples, but also factors that contribute to technical artifacts. To see the read group information for a BAM file, use the following command.

+
samtools view -H Solexa-272222.bam | grep '@RG'
+

This prints the lines starting with @RG within the header. Our tutorial file's single @RG line is shown below. The file has the read group fields required by this workflow as well as extra fields for record keeping. Two read group fields, ID and PU, appropriately differentiate flow cell lane, marked by .2, a factor that contributes to batch effects.

+
@RG ID:H0164.2  PL:illumina PU:H0164ALXX140820.2    LB:Solexa-272222    PI:0    DT:2014-08-20T00:00:00-0400 SM:NA12878  CN:BI
+ +

If your sample collection's BAM files lack required fields or do not differentiate pertinent factors within the fields, use Picard's AddOrReplaceReadGroups to add or appropriately rename the read group fields.
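
For example, a command along these lines applies the read group values used in this tutorial (a sketch; the input and output file names are placeholders and the values should match your own data):

    java -Xmx8G -jar /path/picard.jar AddOrReplaceReadGroups \
        I=my.bam \
        O=my_rg.bam \
        RGID=H0164.2 \
        RGPU=H0164ALXX140820.2 \
        RGSM=NA12878 \
        RGLB=Solexa-272222 \
        RGPL=illumina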

+

Here we illustrate how to derive both ID and PU fields from query names. We break down the common portion of two different read query names from the tutorial file.

+
H0164ALXX140820:2:1101:10003:23460
+H0164ALXX140820:2:1101:15118:25288
+
+#Breaking down the common portion of the query names:
+H0164____________ # portion of @RG ID and PU fields indicating Illumina flow cell
+_____ALXX140820__ # portion of @RG PU field indicating barcode or index in a multiplexed run
+_______________:2 # portion of @RG ID and PU fields indicating flow cell lane
+
+

(A) Convert the FASTQ to uBAM and add read group information using FastqToSam

+

Picard's FastqToSam transforms a FASTQ file to unmapped BAM, requires two read group fields and makes optional specification of other read group fields. In the command below we note which fields are required for our workflow. All other read group fields are optional.

+
java -Xmx8G -jar /seq/software/picard/current/bin/picard.jar FastqToSam \
+    FASTQ=snippet_XT_interleaved.fq \ #our single tutorial file contains both reads in a pair 
+    OUTPUT=snippet_FastqToSam_PU.bam \
+    READ_GROUP_NAME=H0164.2 \ # required; changed from default of A
+    SAMPLE_NAME=NA12878 \ # required
+    LIBRARY_NAME=Solexa-272222 \ # required 
+    PLATFORM_UNIT=H0164ALXX140820.2 \ 
+    PLATFORM=illumina \ # recommended
+    SEQUENCING_CENTER=BI \ 
+    RUN_DATE=2014-08-20T00:00:00-0400
+

Some details on select parameters:

+ +
+

(B1) [Optional] Extract reads in a genomic interval from aligned BAM

+

We want to test our reversion process on a subset of the tutorial file before committing to reverting the entire BAM. This process requires the reads in the BAM to be aligned to a reference genome and produces a BAM containing reads from a genomic interval.

+
java -Xmx8G -jar /path/GenomeAnalysisTK.jar \
+    -T PrintReads \ 
+    -R /path/human_g1k_v37_decoy.fasta \
+    -L 10:90000000-100000000 \ # this is the retained interval
+    -I Solexa-272222.bam -o snippet.bam # snippet.bam is newly created
+ +
+

(B2) Convert aligned BAM to uBAM and discard problematic records using RevertSam

+

We use Picard's RevertSam to remove alignment information. The resulting unmapped BAM (uBAM) has two uses in this workflow: (1) for processing through the MarkIlluminaAdapters branch of the workflow, and (2) for application of read group, read sequence and other read meta information to the aligned read file in the MergeBamAlignment branch of the workflow. The RevertSam parameters we specify remove information pertaining to previous alignments including program group records and standard alignment flags and tags that would otherwise transfer over in the MergeBamAlignment step. We remove nonstandard alignment tags with the ATTRIBUTE_TO_CLEAR option. For example, we clear the XT tag using this option so that it is free for use by MarkIlluminaAdapters. Our settings also reset flags to unmapped values, e.g. 77 and 141 for paired reads. Additionally, we invoke the SANITIZE option to remove reads that cause problems for MarkIlluminaAdapters. Our tutorial's snippet requires such filtering while Solexa-272222 does not.

+

For our particular file, we use the following parameters.

+
java -Xmx8G -jar /path/picard.jar RevertSam \
+    I=snippet.bam \
+    O=snippet_revert.bam \
+    SANITIZE=true \ 
+    MAX_DISCARD_FRACTION=0.005 \ # informational; does not affect processing
+    ATTRIBUTE_TO_CLEAR=XT \
+    ATTRIBUTE_TO_CLEAR=XN \
+    ATTRIBUTE_TO_CLEAR=AS \ #Picard release of 9/2015 clears AS by default
+    ATTRIBUTE_TO_CLEAR=OC \
+    ATTRIBUTE_TO_CLEAR=OP \
+    SORT_ORDER=queryname \ #default
+    RESTORE_ORIGINAL_QUALITIES=true \ #default
+    REMOVE_DUPLICATE_INFORMATION=true \ #default
+    REMOVE_ALIGNMENT_INFORMATION=true #default
+

To process large files, also designate a temporary directory.

+
    TMP_DIR=/path/shlee # designates a directory for temporary files generated during processing
+

We change these settings for RevertSam:

+ +

Some comments on options kept at default:

+ +

For snippet.bam, SANITIZE removes 25,909 out of 2,735,539 (0.947%) reads, leaving us with 2,709,630 reads. The intact BAM retains all reads. The example shows a read pair before and after RevertSam.

+
#original BAM
+H0164ALXX140820:2:1101:10003:23460  83  10  91515318    60  151M    =   91515130    -339    CCCATCCCCTTCCCCTTCCCTTTCCCTTTCCCTTTTCTTTCCTCTTTTAAAGAGACAAGGTCTTGTTCTGTCACCCAGGCTGGAATGCAGTGGTGCAGTCATGGCTCACTGCCGCCTCAGACTTCAGGGCAAAAGCAATCTTTCCAGCTCA :<<=>@AAB@AA@AA>6@@A:>,*@A@<@??@8?9>@==8?:?@?;?:><??@>==9?>8>@:?>>=>;<==>>;>?=?>>=<==>>=>9<=>??>?>;8>?><?<=:>>>;4>=>7=6>=>>=><;=;>===?=>=>>?9>>>>??==== MC:Z:60M91S MD:Z:151    PG:Z:MarkDuplicates RG:Z:H0164.2    NM:i:0  MQ:i:0  OQ:Z:<FJFFJJJJFJJJJJF7JJJ<F--JJJFJJJJ<J<FJFF<JAJJJAJAJFFJJJFJAFJAJJAJJJJJFJJJJJFJJFJJJJFJFJJJJFFJJJJJJJFAJJJFJFJFJJJFFJJJ<J7JJJJFJ<AFAJJJJJFJJJJJAJFJJAFFFFA    UQ:i:0  AS:i:151
+H0164ALXX140820:2:1101:10003:23460  163 10  91515130    0   60M91S  =   91515318    339 TCTTTCCTTCCTTCCTTCCTTGCTCCCTCCCTCCCTCCTTTCCTTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTCCCCTCTCCCACCCCTCTCTCCCCCCCTCCCACCC :0;.=;8?7==?794<<;:>769=,<;0:=<0=:9===/,:-==29>;,5,98=599;<=########################################################################################### SA:Z:2,33141573,-,37S69M45S,0,1;    MC:Z:151M   MD:Z:48T4T6 PG:Z:MarkDuplicates RG:Z:H0164.2    NM:i:2  MQ:i:60 OQ:Z:<-<-FA<F<FJF<A7AFAAJ<<AA-FF-AJF-FA<AFF--A-FA7AJA-7-A<F7<<AFF###########################################################################################    UQ:i:49 AS:i:50
+
+#after RevertSam (step 1.B2)
+H0164ALXX140820:2:1101:10003:23460  77  *   0   0   *   *   0   0   TGAGCTGGAAAGATTGCTTTTGCCCTGAAGTCTGAGGCGGCAGTGAGCCATGACTGCACCACTGCATTCCAGCCTGGGTGACAGAACAAGACCTTGTCTCTTTAAAAGAGGAAAGAAAAGGGAAAGGGAAAGGGAAGGGGAAGGGGATGGG AFFFFAJJFJAJJJJJFJJJJJAFA<JFJJJJ7J<JJJFFJJJFJFJFJJJAFJJJJJJJFFJJJJFJFJJJJFJJFJJJJJFJJJJJAJJAJFAJFJJJFFJAJAJJJAJ<FFJF<J<JJJJFJJJ--F<JJJ7FJJJJJFJJJJFFJF< RG:Z:H0164.2
+H0164ALXX140820:2:1101:10003:23460  141 *   0   0   *   *   0   0   TCTTTCCTTCCTTCCTTCCTTGCTCCCTCCCTCCCTCCTTTCCTTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTCCCCTCTCCCACCCCTCTCTCCCCCCCTCCCACCC <-<-FA<F<FJF<A7AFAAJ<<AA-FF-AJF-FA<AFF--A-FA7AJA-7-A<F7<<AFF########################################################################################### RG:Z:H0164.2
+

back to top

+
+

+

2. Mark adapter sequences using MarkIlluminaAdapters

+

Previously we cleared the XT tag from our BAM so Picard's MarkIlluminaAdapters can use it to mark adapter sequences. SamToFastq (step 4) will use these in turn to assign low base quality scores to the adapter bases, effectively removing their contribution to read alignment and alignment scoring metrics. For the tutorial data, adapter sequences have already been removed from the beginning of reads. We want to additionally effectively remove any adapter sequences at the ends of reads arising from read-through to adapters in read pairs with shorter inserts.

+
java -Xmx8G -jar /path/picard.jar MarkIlluminaAdapters \
+    I=snippet_revert.bam \
+    O=snippet_revertmark.bam \
+    M=snippet_revertmark.metrics.txt \ #naming required
+    TMP_DIR=/path/shlee # optional to process large files
+ +

The example shows a read pair marked with the XT tag by MarkIlluminaAdapters. This is a different pair than shown previously as H0164ALXX140820:2:1101:10003:23460 reads do not contain adapter sequence. The insert region sequences for the reads overlap by a length corresponding approximately to the XT tag value. The same read pair is shown after SamToFastq transformation, where adapter sequence base quality scores have been set to 2 (# symbol), and after MergeBamAlignment, which restores original base quality scores.

+
#after MarkIlluminaAdapters (step 2)
+H0164ALXX140820:2:1101:15118:25288  77  *   0   0   *   *   0   0   
+ACCTGCCTCAGCCTCCCAAAGTGCTGGGATTATAGGTATGTGTCACCACACCCAGCCAAGTATACTCACATTGTCGTGCAACCAAACTCCAGAACTTTTTCATCTTAAAGAATCAAGGTTTTTTATTGTTTACTTTATTACTTATTTATTT 
+AFFFFFJJFJFAAJJFFJJFJFJ<FJJJJJJF<JJJFFJJAF7JJJAAF7AJJFJFJFFJ--A-FAJA-F<J7A--AFJ7AJ7AJ-FJ7-JJJ-F-J---7J---7FF-JAJJ<A7JFAFAA7--FF----AF-7<JF<JFA-7<F-FF-J RG:Z:H0164.2    XT:i:63
+H0164ALXX140820:2:1101:15118:25288  141 *   0   0   *   *   0   0   
+GTCATGGCTGGACGCAGTGGCTCATACCTGTAATCCCAGCACTTTTGGAGGCTGAGGCAGGTAGATCGGAAGCGCCTCGTGTAGGGAGAGAGGGTTAACAAAAATGTAGATACCGGAGGTCGCCGTAAAATAAAAAAGTAGCAAGGAGTAG 
+AAFFFJJJJJAJJJJJFJJJJ<JFJJJJJJJJFJJJJFJ<FJJJJAJJJJJJJJFJJJ7JJ--JJJ<J<-FJ7F--<-J7--7AJJA-J------J7F<-77--F--FFJ---J-J-J--A-7<<----J-7-J-FJ--J--FA####### RG:Z:H0164.2    XT:i:63
+
+#after SamToFastq (step 3)
+@H0164ALXX140820:2:1101:15118:25288/1
+ACCTGCCTCAGCCTCCCAAAGTGCTGGGATTATAGGTATGTGTCACCACACCCAGCCAAGTATACTCACATTGTCGTGCAACCAAACTCCAGAACTTTTTCATCTTAAAGAATCAAGGTTTTTTATTGTTTACTTTATTACTTATTTATTT
++
+AFFFFFJJFJFAAJJFFJJFJFJ<FJJJJJJF<JJJFFJJAF7JJJAAF7AJJFJFJFFJ--#########################################################################################
+@H0164ALXX140820:2:1101:15118:25288/2
+GTCATGGCTGGACGCAGTGGCTCATACCTGTAATCCCAGCACTTTTGGAGGCTGAGGCAGGTAGATCGGAAGCGCCTCGTGTAGGGAGAGAGGGTTAACAAAAATGTAGATACCGGAGGTCGCCGTAAAATAAAAAAGTAGCAAGGAGTAG
++
+AAFFFJJJJJAJJJJJFJJJJ<JFJJJJJJJJFJJJJFJ<FJJJJAJJJJJJJJFJJJ7JJ-#########################################################################################
+
+#after MergeBamAlignment (step 7)
+H0164ALXX140820:2:1101:15118:25288  99  10  99151971    60  151M    =   99152350    440 
+ACCTGCCTCAGCCTCCCAAAGTGCTGGGATTATAGGTATGTGTCACCACACCCAGCCAAGTATACTCACATTGTCGTGCAACCAAACTCCAGAACTTTTTCATCTTAAAGAATCAAGGTTTTTTATTGTTTACTTTATTACTTATTTATTT
+AFFFFFJJFJFAAJJFFJJFJFJ<FJJJJJJF<JJJFFJJAF7JJJAAF7AJJFJFJFFJ--A-FAJA-F<J7A--AFJ7AJ7AJ-FJ7-JJJ-F-J---7J---7FF-JAJJ<A7JFAFAA7--FF----AF-7<JF<JFA-7<F-FF-J MC:Z:90S61M MD:Z:74T10T3A37T23  PG:Z:bwamem RG:Z:H0164.2    NM:i:4  MQ:i:60 UQ:i:48 AS:i:131    XS:i:40
+H0164ALXX140820:2:1101:15118:25288  147 10  99152350    60  90S61M  =   99151971    -440
+CTACTCCTTGCTACTTTTTTATTTTACGGCGACCTCCGGTATCTACATTTTTGTTAACCCTCTCTCCCTACACGAGGCGCTTCCGATCTACCTGCCTCAGCCTCCAAAAGTGCTGGGATTACAGGTATGAGCCACTGCGTCCAGCCATGAC 
+#######AF--J--JF-J-7-J----<<7-A--J-J-J---JFF--F--77-<F7J------J-AJJA7--7J-<--F7JF-<J<JJJ--JJ7JJJFJJJJJJJJAJJJJF<JFJJJJFJJJJJJJJFJ<JJJJFJJJJJAJJJJJFFFAA MC:Z:151M   MD:Z:61 PG:Z:bwamem RG:Z:H0164.2    NM:i:0  MQ:i:60 UQ:i:0  AS:i:61 XS:i:50
+

Snippet_revertmark.bam marks 5,810 reads (0.21%) with XT, while Solexa-272222_revertmark.bam marks 3,236,552 reads (0.39%). We plot the metrics data using RStudio. +

+

back to top

+
+

+

3. Convert BAM to FASTQ using SamToFastq

+

Picard's SamToFastq takes read identifiers, read sequences, and base quality scores to write a Sanger FASTQ format file. We use additional options to effectively remove adapter sequences previously marked with the XT tag. All extant meta data, i.e. alignment information, flags and tags, are purged in this transformation.

+
java -Xmx8G -jar /path/picard.jar SamToFastq \
+    I=snippet_revertmark.bam \
+    FASTQ=snippet_XT_interleaved.fq \
+    CLIPPING_ATTRIBUTE=XT \
+    CLIPPING_ACTION=2 \
+    INTERLEAVE=true \ 
+    NON_PF=true \
+    TMP_DIR=/path/shlee # optional to process large files         
+ +

[Optional] Compress the FASTQ using gzip

+

This step is optional; it is unnecessary if you pipe steps 3 and 4, as we outline in step 5.

+

BWA handles both FASTQ and gzipped FASTQ files natively--that is, BWA works on both file types directly. To compress the FASTQ file, use the UNIX gzip utility.

+
gzip snippet_XT_interleaved.fq #replaces the file with snippet_XT_interleaved.fq.gz
+

back to top

+
+

+

4. Align reads and flag secondary hits using BWA MEM

+

GATK's variant discovery workflow recommends Burrows-Wheeler Aligner's maximal exact matches (BWA MEM) algorithm (Li 2013 reference; Li 2014 benchmarks; homepage; manual). BWA MEM is suitable for aligning high-quality long reads ranging from 70 bp to 1 Mbp against a large reference genome such as the human genome.

+ +

Index the reference genome file for BWA. Indexing is specific to algorithms. To index the human genome for BWA, we apply BWA's index function on the reference genome file, e.g. human_g1k_v37_decoy.fasta. This produces five index files with the extensions amb, ann, bwt, pac and sa.

+
bwa index -a bwtsw human_g1k_v37_decoy.fasta
+

Align using BWA MEM. The tool automatically locates the index files within the same folder as the reference FASTA file. In the alignment command below, the > redirects the alignment output to the aligned file.
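
A sketch of the standalone alignment command, mirroring the invocation embedded in step 5's pipe and in step 7's PROGRAM_GROUP_COMMAND_LINE (paths, thread count and output file name are examples):

    # -M flags shorter split hits as secondary, for compatibility with downstream tools
    # -t sets the number of threads
    # -p tells BWA the interleaved FASTQ contains both reads of each pair
    /path/bwa mem -M -t 7 -p /path/Homo_sapiens_assembly19.fasta \
        snippet_XT_interleaved.fq > snippet_markXT_aln.sam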

+ +

We invoke three options in the command.

+ +

MarkDuplicates can directly process BWA's alignment, whether or not the alignment marks secondary hits. However, the point of this workflow is to take advantage of the features offered by MergeBamAlignment that allow for the scalable, lossless operating procedure practiced by Broad's Genomics Platform and to produce comparable metrics.

+

back to top

+
+

+

5. [Optional] Pipe steps 3 & 4 and collect alignment metrics

+

Piping processes saves time and space. Our tutorial's resulting SAM file is small enough to easily view, manipulate and store. For larger data, however, consider using Unix pipelines. Piping streams data through standard input and output (I/O) directly from one process to the next, for efficient processing and storage. We recommend piping steps 3 and 4 so as to avoid rereading and storing the large intermediate FASTQ file.

+

You may additionally extend piping to include step 6's SortSam. Steps 3-4-6 are piped in the example command below to generate an aligned BAM file and index. [For the larger file, I couldn't pipe Step 7's MergeBamAlignment.]

+
#overview of command structure
+[step 3's SamToFastq] | [step 4's bwa mem] | [step 6's SortSam]
+
+#for our file  
+java -Xmx8G -jar /path/picard.jar SamToFastq I=snippet_revertmark.bam \
+    FASTQ=/dev/stdout \
+    CLIPPING_ATTRIBUTE=XT CLIPPING_ACTION=2 INTERLEAVE=true NON_PF=true \
+    TMP_DIR=/path/shlee | \ 
+    /path/bwa mem -M -t 7 -p /path/Homo_sapiens_assembly19.fasta \
+    /dev/stdin | \  #to stop piping here, add '> snippet_piped.sam'
+    java -Xmx8G -jar /path/picard.jar SortSam \
+    INPUT=/dev/stdin \
+    OUTPUT=snippet_piped.bam \
+    SORT_ORDER=coordinate CREATE_INDEX=true \
+    TMP_DIR=/path/shlee
+

Calculate alignment metrics using Picard tools. Picard offers a variety of metrics collecting tools, e.g. CollectAlignmentSummaryMetrics, CollectWgsMetrics and CollectInsertSizeMetrics. Some tools give more detailed metrics if given the reference sequence. See Picard for metrics definitions. Metrics calculations will differ if run on the BAM directly from alignment (BWA) versus on the merged BAM (MergeBamAlignment). See [link--get from G] for guidelines on when to run tools.

+
java -Xmx8G -jar /path/picard.jar CollectAlignmentSummaryMetrics \
+    R=/path/Homo_sapiens_assembly19.fasta \
+    INPUT=slice.bam \
+    OUTPUT=slice_bam_metrics.txt \
+    TMP_DIR=/path/shlee # optional to process large files
+

For example, percent chimeras is a calculated metric. Our tutorial alignment of the whole data set gives 0.019% (BWA) or 0.0034% (MergeBamAlignment) chimeric paired reads. The genomic interval defined in step 1 reports 0.0032% chimeric paired reads. In contrast, the aligned snippet gives 0.0012% (BWA) or 0.00002% (MergeBamAlignment) chimeric paired reads. This illustrates in part the differences I alluded to at the beginning of step 4.

+

back to top

+
+

+

6. [Optional] Sort, index and convert alignment to a BAM using SortSam and visualize on IGV

+

Picard's SortSam sorts, indexes and converts between SAM and BAM formats. For file manipulations and to view aligned reads using the Integrative Genomics Viewer (IGV), the SAM or BAM file must be coordinate-sorted and indexed. Some Picard tools, such as MergeBamAlignment in step 7, by default coordinate sort and can use the standard CREATE_INDEX option. If you didn't create an index in step 7, or want to convert to BAM and index the alignment file from step 4, then use Picard's SortSam. The index file created for the coordinate-sorted BAM will have a bai extension.

+
java -Xmx8G -jar /path/picard.jar SortSam \
+    INPUT=Solexa-272222_markXT_aln.sam \ 
+    OUTPUT=Solexa-272222_markXT_aln.bam \ #extension here specifies format conversion
+    SORT_ORDER=coordinate \
+    CREATE_INDEX=true \ # a standard option for Picard commands
+    TMP_DIR=/path/shlee # optional to process large files
+

View aligned reads using the Integrative Genomics Viewer (IGV). Of the multiple IGV versions, the Java Web Start jnlp version allows the highest memory, as of this writing 10 GB for machines with 64-bit Java.

+ +

Here, IGV displays our example chimeric pair, H0164ALXX140820:2:1101:10003:23460 at its alignment loci. BWA's secondary alignment designation causes the mates on chromosome 10 to display as unpaired in IGV's paired view. MergeBamAlignment corrects for this when it switches the secondary alignment designation. Mates display as paired on chromosome 10.

+

Visualizing alignments in such a manner makes apparent certain convergent information. For example, we see that the chimeric region on chromosome 2 is a low complexity GC-rich region, apparent by the predominantly yellow coloring (representing guanine) of the reference region. We know there are many multimapping reads because reads with MAPQ score of zero are filled in white versus gray, and the region is down-sampled, as indicated by the underscoring in the log-scaled coverage chart. We can infer reads in this chromosome 2 region are poorly mapped based on the region's low complexity, depth of reads and prevalence of low MAPQ reads.

+ +

back to top

+
+

+

7. Restore altered data and apply & adjust meta information using MergeBamAlignment

+

Our alignment file lacks read group information and certain tags, such as the mate CIGAR (MC) tag. It has hard-clipped sequences and altered base qualities. The alignment also has some mapping artifacts we would like to correct so that records remain congruent. Finally, the alignment records require coordinate sorting and indexing.

+

We use Picard's MergeBamAlignment to address all of these needs to produce a raw BAM file that is ready for GATK's variant discovery workflow. MergeBamAlignment takes metadata from a SAM or BAM file of unmapped reads (uBAM) and merges it with a SAM or BAM file containing alignment records for a subset of those reads. Metadata include read group information, read sequences, base quality scores and tags. The tool applies read group information from the uBAM and retains the program group information from the aligned file. In restoring original sequences, MergeBamAlignment adjusts CIGAR strings from hard-clipped to soft-clipped. The tool adjusts flag values, e.g. changes primary alignment designations according to a user-specified strategy, for desired congruency. Optional parameters allow introduction of additional metadata, e.g. user-specified program group information or nonstandard aligner-generated tags. If the alignment file is missing reads present in the unaligned file, these are retained as unaligned records. Finally, alignment records are coordinate sorted, meaning they are ordered by chromosomal mapping position.

+ +

A read with multiple alignment records may map to multiple loci or may be chimeric--that is, splits the alignment. It is possible for an aligner to produce multiple alignments as well as multiple primary alignments, e.g. in the case of a linear alignment set of split reads. When one alignment, or alignment set in the case of chimeric read records, is designated primary, others are designated either secondary or supplementary. Invoking the -M option, we had BWA mark the record with the longest aligning section of split reads as primary and all other records as secondary. MergeBamAlignment further adjusts this secondary designation and other flags, e.g. read mapped in proper pair and mate unmapped flags, to fix mapping artifacts. We only note some changes made by MergeBamAlignment to our tutorial data and by no means comprehensively list its features.

+
java -Xmx16G -jar /path/picard.jar MergeBamAlignment \
+    R=/path/Homo_sapiens_assembly19.fasta \ 
+    UNMAPPED_BAM=Solexa-272222_revertclean.bam \ 
+    ALIGNED_BAM=Solexa-272222_markXT_aln.sam \
+    O=Solexa-272222_merge_IGV_raw.bam \ #output file name in SAM or BAM format
+    CREATE_INDEX=true \ #standard option for any Picard command
+    ADD_MATE_CIGAR=true \ #default; adds MC tag
+    CLIP_ADAPTERS=false \ #changed from default
+    CLIP_OVERLAPPING_READS=true \ #default; soft-clips ends so mates do not overlap
+    INCLUDE_SECONDARY_ALIGNMENTS=true \ #default
+    MAX_INSERTIONS_OR_DELETIONS=-1 \ #changed to allow any number of insertions or deletions
+    PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ #changed from default BestMapq
+    ATTRIBUTES_TO_RETAIN=XS \ #specify multiple times to retain alignment tags starting with X, Y, or Z 
+    TMP_DIR=/path/shlee #optional to process large files
+

You need not invoke the PROGRAM options, as BWA's program group information is sufficient and transfers from the alignment during the merging. If, for whatever reason, you need to apply program group information by a different means, then use MergeBamAlignment to assign each of the following program group options. Example information is given.

+
    PROGRAM_RECORD_ID=bwa \
+    PROGRAM_GROUP_NAME=bwamem \
+    PROGRAM_GROUP_VERSION=0.7.7-r441 \
+    PROGRAM_GROUP_COMMAND_LINE='/path/bwa mem -M -t 7 -p /path/Homo_sapiens_assembly19.fasta Solexa-272222_interleavedXT.fq > Solexa-272222_markXT_aln.sam' \ 
+

In the command, we change CLIP_ADAPTERS, MAX_INSERTIONS_OR_DELETIONS and PRIMARY_ALIGNMENT_STRATEGY values from default, and invoke other optional parameters.

+ +

Original base quality score restoration is illustrated in Step 3. The following example shows a read pair for which MergeBamAlignment adjusts multiple other information fields. The query name is listed thrice because we have paired reads where one of the reads has two alignment loci, on chromosome 2 and on chromosome 10. The mate is mapped with high MAPQ to chromosome 10. The two loci align 69 and 60 nucleotide regions, respectively, and the aligned regions coincide by 15 bases. A good portion of the chromosome 2 aligned region has low base quality scores. The NM tag indicates that the chromosome 2 alignment requires one change to match the reference, while the chromosome 10 read requires two changes and this is also reflected in the MD tags that provide the mismatching positions. When tallying alignment scores, given by the AS tag, aligners penalize mismatching positions, here apparently by five points per mismatch, e.g. 60 matches minus two mismatches multiplied by five gives an alignment score of 50. Both read records have values for the XS (suboptimal alignment score) and SA (chimeric alignment) tags that indicate a split read. Flag values, set by BWA, indicate the chromosome 2 record is primary and the chromosome 10 record is secondary.

+
#aligned reads from step 4
+H0164ALXX140820:2:1101:10003:23460  177 2   33141435    0   37S69M45S   10  91515318    0   
+GGGTGGGAGGGGGGGAGAGAGGGGTGGGAGAGGGGAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGAAGGAAAGGAGGGAGGGAGGGAGCAAGGAAGGAAGGAAGGAAAGA ###########################################################################################FFA<<7F<A-7-AJA7AF-A--FFA<AF-FJA-FF-AA<<JAAFA7A<FJF<F<AF-<-< NM:i:1  MD:Z:51G17  AS:i:64 XS:i:64 SA:Z:10,91515130,+,60M91S,0,2;
+
+H0164ALXX140820:2:1101:10003:23460  417 10  91515130    0   60M91H  =   91515318    339 
+TCTTTCCTTCCTTCCTTCCTTGCTCCCTCCCTCCCTCCTTTCCTTCCCCCCCCCCCCCCC    <-<-FA<F<FJF<A7AFAAJ<<AA-FF-AJF-FA<AFF--A-FA7AJA-7-A<F7<<AFF    NM:i:2  MD:Z:48T4T6 AS:i:50 XS:i:36 SA:Z:2,33141435,-,37S69M45S,0,1;
+
+H0164ALXX140820:2:1101:10003:23460  113 10  91515318    60  151M    2   33141435    0
+CCCATCCCCTTCCCCTTCCCTTTCCCTTTCCCTTTTCTTTCCTCTTTTAAAGAGACAAGGTCTTGTTCTGTCACCCAGGCTGGAATGCAGTGGTGCAGTCATGGCTCACTGCCGCCTCAGACTTCAGGGCAAAAGCAATCTTTCCAGCTCA <FJFFJJJJFJJJJJF7JJJ<F--JJJFJJJJ<J<FJFF<JAJJJAJAJFFJJJFJAFJAJJAJJJJJFJJJJJFJJFJJJJFJFJJJJFFJJJJJJJFAJJJFJFJFJJJFFJJJ<J7JJJJFJ<AFAJJJJJFJJJJJAJFJJAFFFFA NM:i:0  MD:Z:151    AS:i:151    XS:i:0
+
+#after merging (step 7)
+H0164ALXX140820:2:1101:10003:23460  409 2   33141435    0   37S69M45S   =   33141435    0   
+GGGTGGGAGGGGGGGAGAGAGGGGTGGGAGAGGGGAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGAAGGAAAGGAGGGAGGGAGGGAGCAAGGAAGGAAGGAAGGAAAGA ###########################################################################################FFA<<7F<A-7-AJA7AF-A--FFA<AF-FJA-FF-AA<<JAAFA7A<FJF<F<AF-<-< SA:Z:10,91515130,+,60M91S,0,2;  MD:Z:51G17  PG:Z:bwamem RG:Z:H0164.2    NM:i:1  UQ:i:2  AS:i:64 XS:i:64
+
+H0164ALXX140820:2:1101:10003:23460  163 10  91515130    0   60M91S  =   91515318    339 
+TCTTTCCTTCCTTCCTTCCTTGCTCCCTCCCTCCCTCCTTTCCTTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTCCCCTCTCCCACCCCTCTCTCCCCCCCTCCCACCC ###########################################################################################FFA<<7F<A-7-AJA7AF-A--FFA<AF-FJA-FF-AA<<JAAFA7A<FJF<F<AF-<-< SA:Z:2,33141435,-,37S69M45S,0,1;    MC:Z:151M   MD:Z:48T4T6 PG:Z:bwamem RG:Z:H0164.2    NM:i:2  MQ:i:60 UQ:i:4  AS:i:50 XS:i:36
+
+H0164ALXX140820:2:1101:10003:23460  83  10  91515318    60  151M    =   91515130    -339    
+CCCATCCCCTTCCCCTTCCCTTTCCCTTTCCCTTTTCTTTCCTCTTTTAAAGAGACAAGGTCTTGTTCTGTCACCCAGGCTGGAATGCAGTGGTGCAGTCATGGCTCACTGCCGCCTCAGACTTCAGGGCAAAAGCAATCTTTCCAGCTCA <FJFFJJJJFJJJJJF7JJJ<F--JJJFJJJJ<J<FJFF<JAJJJAJAJFFJJJFJAFJAJJAJJJJJFJJJJJFJJFJJJJFJFJJJJFFJJJJJJJFAJJJFJFJFJJJFFJJJ<J7JJJJFJ<AFAJJJJJFJJJJJAJFJJAFFFFA MC:Z:60M91S MD:Z:151    PG:Z:bwamem RG:Z:H0164.2    NM:i:0  MQ:i:0  UQ:i:0  AS:i:151    XS:i:0
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
originalRevertSamBWA MEMMergeBamAlignment
RGRGRGread group
PGPGPGprogram group
OCoriginal cigar
XN# of ambiguous bases in ref
OPoriginal mapping position
SASASAchimeric alignment
MDMDMDstring for mismatching positions
NMNMNM# of mismatches
ASASASalignment score
UQUQPhred likelihood of the segment
MCMCCIGAR string for mate
MQMQmapping quality of the mate
OQoriginal base quality
XTtool specific
XSXSBWA's secondary alignment score
+ +

After merging our whole tutorial file, our unmapped read records increase by 620, from 5,334,323 to 5,334,943, due to changes in flag designations and not because any reads failed to map. Our total read records remain the same at 828,846,200 for our 819,728,254 original reads, giving ~1.11% multi-record reads.
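
Such counts can be tallied with samtools, for instance (a sketch; requires samtools on your path):

    samtools view -c Solexa-272222_merge_IGV_raw.bam        # total read records
    samtools view -c -f 4 Solexa-272222_merge_IGV_raw.bam   # records with the unmapped flag (0x4) set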

+

back to top

+
\ No newline at end of file diff --git a/doc_archive/developer-zone/(howto)_Set_up_remote_debugging_in_IntelliJ.md b/doc_archive/developer-zone/(howto)_Set_up_remote_debugging_in_IntelliJ.md new file mode 100644 index 000000000..b8f621824 --- /dev/null +++ b/doc_archive/developer-zone/(howto)_Set_up_remote_debugging_in_IntelliJ.md @@ -0,0 +1,28 @@ +## (howto) Set up remote debugging in IntelliJ + +http://gatkforums.broadinstitute.org/gatk/discussion/4712/howto-set-up-remote-debugging-in-intellij + +

Remote debugging is a powerful tool but requires a little bit of setup. Here is the 3-step process to an easier life.

+

1. Set up the remote config in IntelliJ

+

Do the following in IntelliJ:

+ +

2. Run the tool on the gsa machine

+

Run the GATK command from the server with

+
+java -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5-digit_port_number \
+     -jar _toolName_ \
+     args
+
+

Because of suspend=y, GATK will wait for IntelliJ to attach before it actually starts running.

+

3. Chase bug(s) in IntelliJ

+

Go to IntelliJ

+ +

Now chase.

+

You can also add the agentlib business as an alias in your .profile or .my.bashrc on the server like I did. Boom.
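
For example, something along these lines in .my.bashrc (a sketch; the alias name and port number are arbitrary):

    alias jdebug='java -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=50505'
    # then on the server:
    # jdebug -jar GenomeAnalysisTK.jar -T CountReads -R ref.fasta -I input.bam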

\ No newline at end of file diff --git a/doc_archive/developer-zone/(howto)_Speed_up_GATK_compilation.md b/doc_archive/developer-zone/(howto)_Speed_up_GATK_compilation.md new file mode 100644 index 000000000..10c9eca3d --- /dev/null +++ b/doc_archive/developer-zone/(howto)_Speed_up_GATK_compilation.md @@ -0,0 +1,31 @@ +## (howto) Speed up GATK compilation + +http://gatkforums.broadinstitute.org/gatk/discussion/5784/howto-speed-up-gatk-compilation + +
+

TL;DR: mvn -Ddisable.shadepackage verify

+
+

Background

+

In addition to Queue's GATK-wrapper codegen, relatively slow scala compilation, etc. there's still a lot of legacy compatibility from our ant days in the Maven scripts. Our mvn verify behaves more like when one runs ant, and builds everything needed to bundle the GATK.

+

As of GATK 3.4, by default the build for the "protected" code generates jar files that contain every class needed for running, one for the GATK and one for Queue. This is done by the Maven shade plugin, and each of these is called the "package jar". But there's a way to generate a jar file that only contains META-INF/MANIFEST.MF pointers to the dependency jar files, instead of zipping/shading them up. Each of these is the "executable jar", and FYI they are always generated, as that takes seconds, not minutes.

+
+

Instructions for fast compilation

+

While developing and recompiling Queue, disable the shaded jar with -Ddisable.shadepackage. Then run java -jar target/executable/Queue.jar ... If you need to transfer this jar to another machine / directory, you can't copy (or rsync) just the jar, you'll need the entire executable directory.
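
For example, moving it would look something like this (a sketch; the destination is arbitrary):

    rsync -a target/executable/ user@host:/some/path/executable/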

+
# Total expected time, on a local disk, with Queue:
+#   ~5.0 min from clean
+#   ~1.5 min per recompile
+mvn -Ddisable.shadepackage verify
+
+# always available
+java -jar target/executable/Queue.jar --help
+
+# not found when shade disabled
+java -jar target/package/Queue.jar --help
+

If one is only developing for the GATK, skip Queue by adding -P\!queue also.

+
mvn -Ddisable.shadepackage -P\!queue verify
+
+# always available
+java -jar target/executable/GenomeAnalysisTK.jar --help
+
+# not found when queue profile disabled
+java -jar target/executable/Queue.jar --help
\ No newline at end of file diff --git a/doc_archive/developer-zone/Accessing_reads:_AlignmentContext_and_ReadBackedPileup.md b/doc_archive/developer-zone/Accessing_reads:_AlignmentContext_and_ReadBackedPileup.md new file mode 100644 index 000000000..89567d2bb --- /dev/null +++ b/doc_archive/developer-zone/Accessing_reads:_AlignmentContext_and_ReadBackedPileup.md @@ -0,0 +1,49 @@ +## Accessing reads: AlignmentContext and ReadBackedPileup + +http://gatkforums.broadinstitute.org/gatk/discussion/1322/accessing-reads-alignmentcontext-and-readbackedpileup + +

1. Introduction

+

The AlignmentContext and ReadBackedPileup work together to provide the read data associated with a given locus. This section details the tools the GATK provides for working with collections of aligned reads.

+

2. What are read backed pileups?

+

Read backed pileups are objects that contain all of the reads and their offsets that "pile up" at a locus on the genome. They are the basic input data for the GATK LocusWalkers, and underlie most of the locus-based analysis tools like the recalibrator and SNP caller. Unfortunately, there are many ways to view this data, and version one grew unwieldy trying to support all of these approaches. Version two of the ReadBackedPileup presents a consistent and clean interface for working with pileup data, as well as supporting the iterable() interface to enable the convenient for ( PileupElement p : pileup ) for-each loop support.

+

3. How do I get a ReadBackedPileup and/or how do I create one?

+

The best way is simply to grab the pileup (the underlying representation of the locus data) from your AlignmentContext object in map:

+
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+    ReadBackedPileup pileup = context.getPileup();
+    // ... work with the pileup, then return ...
+}
+

This aligns your calculations with the GATK core infrastructure, and avoids any unnecessary data copying from the engine to your walker.

+

If you are trying to create your own, the best constructor is:

+
public ReadBackedPileup(GenomeLoc loc, ArrayList<PileupElement> pileup )
+

requiring only a list of PileupElements, in order of read / offset in the pileup.

+

From List<SAMRecord> and List<Integer>

+

If you happen to have lists of SAMRecords and integer offsets into them you can construct a ReadBackedPileup this way:

+
public ReadBackedPileup(GenomeLoc loc, List<SAMRecord> reads, List<Integer> offsets )
+

4. What's the best way to use them?

+

Best way if you just need reads, bases and quals

+
for ( PileupElement p : pileup ) {
+  System.out.printf("%c %c %d%n", p.getBase(), p.getSecondBase(), p.getQual());
+  // you can get the read itself too using p.getRead()
+}
+

This is the most efficient way to get data, and should be used whenever possible.

+

I just want a vector of bases and quals

+

You can use:

+
public byte[] getBases()
+public byte[] getSecondaryBases()
+public byte[] getQuals()
+

To get the bases and quals as a byte[] array, which is the underlying base representation in the SAM-JDK.

+

All I care about are counts of bases

+

Use the follow function to get counts of A, C, G, T in order:

+
public int[] getBaseCounts()
+

Which returns an int[4] vector with counts according to BaseUtils.simpleBaseToBaseIndex for each base.

+

Can I view just the reads for a given sample, read group, or any other arbitrary filter?

+

The GATK can very efficiently stratify pileups by sample, and less efficiently stratify by read group, strand, mapping quality, base quality, or any arbitrary filter function. The sample-specific functions can be called as follows:

+
pileup.getSamples();
+pileup.getPileupForSample(String sampleName);
+

In addition to the rich set of filtering primitives built into the ReadBackedPileup, you can supply your own primitives by implementing a PileupElementFilter:

+
public interface PileupElementFilter {
+    public boolean allow(final PileupElement pileupElement);
+}
+

and passing it to ReadBackedPileup's generic filter function:

+
public ReadBackedPileup getFilteredPileup(PileupElementFilter filter);
+

See the ReadBackedPileup's java documentation for a complete list of built-in filtering primitives.

+

Historical: StratifiedAlignmentContext

+

While ReadBackedPileup is the preferred mechanism for aligned reads, some walkers still use the StratifiedAlignmentContext to carve up selections of reads. If you find functions that you require in StratifiedAlignmentContext that seem to have no analog in ReadBackedPileup, please let us know and we'll port the required functions for you.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Adding_and_updating_dependencies_[RETIRED].md b/doc_archive/developer-zone/Adding_and_updating_dependencies_[RETIRED].md new file mode 100644 index 000000000..0d192471d --- /dev/null +++ b/doc_archive/developer-zone/Adding_and_updating_dependencies_[RETIRED].md @@ -0,0 +1,45 @@ +## Adding and updating dependencies [RETIRED] + +http://gatkforums.broadinstitute.org/gatk/discussion/1352/adding-and-updating-dependencies-retired + +

Adding Third-party Dependencies

+

The GATK build system uses the Ivy dependency manager to make it easy for our users to add additional dependencies. Ivy can pull the latest jars and their dependencies from the Maven repository, making adding or updating a dependency as simple as adding a new line to the ivy.xml file.

+

If your tool is available in the maven repository, add a line to the ivy.xml file similar to the following:

+
<dependency org="junit" name="junit" rev="4.4" />
+

If you would like to add a dependency to a tool not available in the maven repository, please email gsahelp@broadinstitute.org

+

Updating SAM-JDK and Picard

+

Because we work so closely with the SAM-JDK/Picard team and are critically dependent on the code they produce, we have a special procedure for updating the SAM/Picard jars. Please use the following procedure when updating sam-*.jar or picard-*.jar.

+ +

Updating the Picard public jars

+ +

Updating the Picard private jar

+ \ No newline at end of file diff --git a/doc_archive/developer-zone/Collecting_output.md b/doc_archive/developer-zone/Collecting_output.md new file mode 100644 index 000000000..a7d8a2389 --- /dev/null +++ b/doc_archive/developer-zone/Collecting_output.md @@ -0,0 +1,34 @@ +## Collecting output + +http://gatkforums.broadinstitute.org/gatk/discussion/1341/collecting-output + +

1. Analysis output overview

+

In theory, output can be written to any class implementing the OutputStream interface. In practice, three types of classes are commonly used: PrintStreams for plain text files, SAMFileWriters for BAM files, and VCFWriters for VCF files.

+

2. PrintStream

+

To declare a basic PrintStream for output, use the following declaration syntax:

+
@Output
+public PrintStream out;
+

And use it just as you would any other PrintStream:

+
out.println("Hello, world!");
+

By default, @Output streams prepopulate fullName, shortName, required, and doc. required in this context means that the GATK will always fill in the contents of the out field for you. If the user specifies no --out command-line argument, the 'out' field will be prepopulated with a stream pointing to System.out.

+

If your walker outputs a custom format that requires more than simple concatenation by Queue, you should also implement a custom Gatherer.

+

3. SAMFileWriter

+

For some applications, you might need to manage your own SAM readers and writers directly from inside your walker. Current best practice for creating these Readers / Writers is to declare arguments of type SAMFileReader or SAMFileWriter as in the following example:

+
@Output
+SAMFileWriter outputBamFile = null;
+

If you do not specify the full name and short name, the writer will provide system default names for these arguments. Creating a SAMFileWriter in this way will create the type of writer most commonly used by members of the GSA group at the Broad Institute -- it will use the same header as the input BAM and require presorted data. To change either of these attributes, use the StingSAMFileWriter type instead:

+
@Output
+StingSAMFileWriter outputBamFile = null;
+

and later, in initialize(), run one or both of the following methods:

+

    outputBAMFile.writeHeader(customHeader);
    outputBAMFile.setPresorted(false);

+

You can change the header or presorted state until the first alignment is written to the file.

+

4. VCFWriter

+

VCFWriter outputs behave similarly to PrintStreams and SAMFileWriters. Declare a VCFWriter as follows:

+

@Output(doc="File to which variants should be written",required=true) +protected VCFWriter writer = null;

+

5. Debugging Output

+

The walkers provide a protected logger instance. Users can adjust the debug level of the walkers using the -l command line option.

+

Turning on verbose logging can produce more output than is really necessary. To selectively turn on logging for a class or package, specify a log4j.properties property file from the command line as follows:

+
-Dlog4j.configuration=file:///<your development root>/Sting/java/config/log4j.properties
+

An example log4j.properties file is available in the java/config directory of the Git repository.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Documenting_walkers.md b/doc_archive/developer-zone/Documenting_walkers.md new file mode 100644 index 000000000..dd2d410bc --- /dev/null +++ b/doc_archive/developer-zone/Documenting_walkers.md @@ -0,0 +1,32 @@ +## Documenting walkers + +http://gatkforums.broadinstitute.org/gatk/discussion/1346/documenting-walkers + +

The GATK discovers walker documentation by reading it out of the Javadoc, Sun's design pattern for providing documentation for packages and classes. This page will provide an extremely brief explanation of how to write Javadoc; more information on writing javadoc comments can be found in Sun's documentation.

+

1. Adding walker and package descriptions to the help text

+

The GATK's build system uses the javadoc parser to extract the javadoc for classes and packages and embed the contents of that javadoc in the help system. If you add Javadoc to your package or walker, it will automatically appear in the help. The javadoc parser will pick up on 'standard' javadoc comments, such as the following, taken from PrintReadsWalker:

+
/**
+ * This walker prints out the input reads in SAM format.  Alternatively, the walker can write reads into a specified BAM file.
+ */
+

You can add javadoc to your package by creating a special file, package-info.java, in the package directory. This file should consist of the javadoc for your package plus a package descriptor line. One such example follows:

+
/**
+ * @help.display.name Miscellaneous walkers (experimental)
+ */
+package org.broadinstitute.sting.playground.gatk.walkers;
+

Additionally, the GATK provides two extra custom tags for overriding the information that ultimately makes it into the help.

+ +

2. Hiding experimental walkers (use sparingly, please!)

+

Walkers can be hidden from the documentation system by adding the @Hidden annotation to the top of each walker. @Hidden walkers can still be run from the command-line, but their documentation will not be visible to end users. Please use this functionality sparingly to avoid walkers with hidden command-line options that are required for production use.

+

3. Disabling building of help

+

Because the building of our help text is actually heavyweight and can dramatically increase compile time on some systems, we have a mechanism to disable help generation.

+

Compile with the following command:

+
ant -Ddisable.help=true
+

to disable generation of help.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Frequently_asked_questions_about_Scala.md b/doc_archive/developer-zone/Frequently_asked_questions_about_Scala.md new file mode 100644 index 000000000..b99bfc2a1 --- /dev/null +++ b/doc_archive/developer-zone/Frequently_asked_questions_about_Scala.md @@ -0,0 +1,78 @@ +## Frequently asked questions about Scala + +http://gatkforums.broadinstitute.org/gatk/discussion/1315/frequently-asked-questions-about-scala + +

1. What is Scala?

+

Scala is a combination of an object oriented framework and a functional programming language. For a good introduction see the free online book Programming Scala.

+

The following are extremely brief answers to frequently asked questions about Scala which often pop up when first viewing or editing QScripts. For more information on Scala there a multitude of resources available around the web including the Scala home page and the online Scala Doc.

+

2. Where do I learn more about Scala?

+ +

3. What is the difference between var and val?

+

var is a value you can later modify, while val is similar to final in Java.

+

4. What is the difference between Scala collections and Java collections? / Why do I get the error: type mismatch?

+

Because the GATK and Queue are a mix of Scala and Java sometimes you'll run into problems when you need a Scala collection and instead a Java collection is returned.

+
   MyQScript.scala:39: error: type mismatch;
+     found   : java.util.List[java.lang.String]
+     required: scala.List[String]
+        val wrapped: List[String] = TextFormattingUtils.wordWrap(text, width)
+

Use the implicit definitions in JavaConversions to automatically convert the basic Java collections to and from Scala collections.

+
import collection.JavaConversions._
+

Scala has a very rich collections framework which you should take the time to enjoy. One of the first things you'll notice is that the default Scala collections are immutable, which means you should treat them as you would a String. When you want to 'modify' an immutable collection you need to capture the result of the operation, often assigning the result back to the original variable.

+
var str = "A"
+str + "B"
+println(str) // prints: A
+str += "C"
+println(str) // prints: AC
+
+var set = Set("A")
+set + "B"
+println(set) // prints: Set(A)
+set += "C"
+println(set) // prints: Set(A, C)
+

5. How do I append to a list?

+

Use the :+ operator for a single value.

+
  var myList = List.empty[String]
+  myList :+= "a"
+  myList :+= "b"
+  myList :+= "c"
+

Use ++ for appending a list.

+
  var myList = List.empty[String]
+  myList ++= List("a", "b", "c")
+

6. How do I add to a set?

+

Use the + operator.

+
  var mySet = Set.empty[String]
+  mySet += "a"
+  mySet += "b"
+  mySet += "c"
+

7. How do I add to a map?

+

Use the + and -> operators.

+
  var myMap = Map.empty[String,Int]
+  myMap += "a" -> 1
+  myMap += "b" -> 2
+  myMap += "c" -> 3
+

8. What are Option, Some, and None?

+

Option is a Scala generic type that can either be some generic value or None. Queue often uses it to represent primitives that may be null.

+
  var myNullableInt1: Option[Int] = Some(1)
+  var myNullableInt2: Option[Int] = None
+

9. What is _ / What is the underscore?

+

François Armand's slide deck is a good introduction: http://www.slideshare.net/normation/scala-dreaded

+

To quote from his slides:

+
Give me a variable name but
+- I don't care of what it is
+- and/or
+- don't want to pollute my namespace with it
+

10. How do I format a String?

+

Use the .format() method.

+

This Java snippet:

+
String formatted = String.format("%s %d", myString, myInt);
+

In Scala would be:

+
val formatted = "%s %d".format(myString, myInt)
+

11. Can I use Scala Enumerations as QScript @Arguments?

+

No. Currently Scala's Enumeration class does not interact with the Java reflection API in a way that could be used for Queue command line arguments. You can use Java enums if, for example, you are importing a Java-based walker's enum type.

+

If/when we find a workaround for Queue we'll update this entry. In the meantime try using a String.

\ No newline at end of file diff --git a/doc_archive/developer-zone/GATK_development_process_and_coding_standards.md b/doc_archive/developer-zone/GATK_development_process_and_coding_standards.md new file mode 100644 index 000000000..d95303f7c --- /dev/null +++ b/doc_archive/developer-zone/GATK_development_process_and_coding_standards.md @@ -0,0 +1,165 @@ +## GATK development process and coding standards + +http://gatkforums.broadinstitute.org/gatk/discussion/2129/gatk-development-process-and-coding-standards + +

Introduction

+

This document describes the current GATK coding standards for documentation and unit testing. The overall goal is that all functions be well documented, have unit tests, and conform to the coding conventions described in this guideline. It is primarily meant as an internal reference for team members, but we are making it public to provide an example of how we work. There are a few mentions of specific team member responsibilities and who to contact with questions; please just disregard those as they will not be applicable to you.

+

Coding conventions

+

General conventions

+

The Genome Analysis Toolkit generally follows Java coding standards and good practices, which can be viewed at Sun's site.

+

The original coding standard document for the GATK was developed in early 2009. It remains a reasonable starting point but may be superseded by statements on this page (available as a PDF).

+

Size of functions and functional programming style

+

Code in the GATK should be structured into clear, simple, and testable functions. Clear means that the function takes a limited number of arguments, most of which are values not modified, and in general should return newly allocated results, as opposed to directly modifying the input arguments (functional style). The max. size of functions should be approximately one screen's worth of real estate (no more than 80 lines), including inline comments. If you are writing functions that are much larger than this, you must refactor your code into modular components.

+

Code duplication

+

Do not duplicate code. If you are finding yourself wanting to make a copy of functionality, refactor the code you want to duplicate and enhance it. Duplicating code introduces bugs, makes the system harder to maintain, and will require more work since you will have a new function that must be tested, as opposed to expanding the tests on the existing functionality.

+

Documentation

+

Functions must be documented following the javadoc conventions. That means that the first line of the comment should be a simple statement of the purpose of the function. Following that is an expanded description of the function, such as edge case conditions, requirements on the arguments, state changes, etc. Finally come the @param and @return fields, which should describe the meaning of each function argument and any restrictions on the values allowed or returned. In general, the return field should be about types and ranges of those values, not the meaning of the result, as this should be in the body of the documentation.

+

Testing for valid inputs and contracts

+

The GATK uses Contracts for Java to help us enforce code quality during testing. See CoFoJa for more information. If you've never programmed with contracts, read their excellent description Adding contracts to a stack. Contracts are only enabled when we are testing the code (unittests and integration tests) and not during normal execution, so contracts can be reasonably expensive to compute. They are best used to enforce assumptions about the status of class variables and return results.

+

Contracts are tricky when it comes to input arguments. The best practice is simple:

+ +

Below is an example private function that makes good use of input argument contracts:

+
/**
+ * Helper function to write out a IGV formatted line to out, at loc, with values
+ *
+ * http://www.broadinstitute.org/software/igv/IGV
+ *
+ * @param out a non-null PrintStream where we'll write our line
+ * @param loc the location of values
+ * @param featureName string name of this feature (see IGV format)
+ * @param values the floating point values to associate with loc and feature name in out
+ */
+@Requires({
+        "out != null",
+        "loc != null",
+        "values.length > 0"
+})
+private void printIGVFormatRow(final PrintStream out, final GenomeLoc loc, final String featureName, final double ... values) {
+    // note that start and stop are 0 based, but the stop is exclusive so we don't subtract 1
+    out.printf("%s\t%d\t%d\t%s", loc.getContig(), loc.getStart() - 1, loc.getStop(), featureName);
+    for ( final double value : values )
+        out.print(String.format("\t%.3f", value));
+    out.println();
+} 
+

Final variables

+

Final java fields cannot be reassigned once set. Nearly all variables you write should be final, unless they are obviously accumulator results or other things you actually want to modify. Nearly all of your function arguments should be final. Being final stops incorrect reassigns (a major bug source) as well as more clearly captures the flow of information through the code.
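As a small illustration of this convention, here is a sketch (the class and method are invented for this guide, not taken from the GATK codebase) where only the true accumulator is left non-final:

```
public class CodingStandardsExamples {
    /**
     * Count how many base qualities are at or above a minimum quality.
     *
     * @param quals a non-null array of base qualities
     * @param minQual the minimum quality to count, inclusive
     * @return the number of entries in quals that are >= minQual, between 0 and quals.length
     */
    public static int countHighQualityBases(final byte[] quals, final int minQual) {
        int count = 0;                      // a true accumulator result, so not final
        for ( final byte qual : quals ) {   // never reassigned within the loop body, so final
            if ( qual >= minQual )
                count++;
        }
        return count;
    }
}
```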

+

An example high-quality GATK function

+
/**
+ * Get the reference bases from referenceReader spanned by the extended location of this active region,
+ * including additional padding bp on either side.  If this expanded region would exceed the boundaries
+ * of the active region's contig, the returned result will be truncated to only include on-genome reference
+ * bases
+ * @param referenceReader the source of the reference genome bases
+ * @param padding the padding, in BP, we want to add to either side of this active region extended region
+ * @param genomeLoc a non-null genome loc indicating the base span of the bp we'd like to get the reference for
+ * @return a non-null array of bytes holding the reference bases in referenceReader
+ */
+@Ensures("result != null")
+public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
+    if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null");
+    if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding);
+    if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null");
+    if ( genomeLoc.size() == 0 ) throw new IllegalArgumentException("GenomeLoc must have size > 0 but got " + genomeLoc);
+
+    final byte[] reference =  referenceReader.getSubsequenceAt( genomeLoc.getContig(),
+            Math.max(1, genomeLoc.getStart() - padding),
+            Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases();
+
+    return reference;
+}
+

Unit testing

+

All classes and methods in the GATK should have unit tests to ensure that they work properly, and to protect yourself and others who may want to extend, modify, enhance, or optimize your code. The GATK development team assumes that anything that isn't unit tested is broken. Perhaps right now they aren't broken, but with a team of 10 people they will become broken soon if you don't ensure they are correct going forward with unit tests.

+

Walkers are a particularly complex issue. Unit testing the map and reduce results is very hard, and in my view largely unnecessary. That said, you should write your walkers and supporting classes in such a way that all of the complex data processing functions are separated from the map and reduce functions, and those should be unit tested properly.

+

Code coverage tells you how much of your class, at the statement or function level, has unit testing coverage. The GATK development standard is to reach something >80% method coverage (and ideally >80% statement coverage). The target is flexible as some methods are trivial (they just call into another method) so perhaps don't need coverage. At the statement level, you get deducted from 100% for branches that check for things that perhaps you don't care about, such as illegal arguments, so reaching 100% statement level coverage is unrealistic for most classes.

+

You can find out more information about generating code coverage results at Analyzing coverage with clover

+

We've created a unit testing example template in the GATK codebase that provides examples of creating core GATK data structures from scratch for unit testing. The code is in class ExampleToCopyUnitTest and can be viewed here in github directly ExampleToCopyUnitTest.
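For flavor, here is a minimal sketch of such a test written with TestNG, the framework used by the GATK test suite; it exercises the hypothetical countHighQualityBases helper from the coding-conventions example above rather than a real GATK class:

```
import org.testng.Assert;
import org.testng.annotations.Test;

public class CodingStandardsExamplesUnitTest {
    @Test
    public void testCountHighQualityBases() {
        final byte[] quals = { 10, 20, 30 };
        // only the 20 and 30 are at or above the minimum quality of 20
        Assert.assertEquals(CodingStandardsExamples.countHighQualityBases(quals, 20), 2);
    }

    @Test
    public void testEmptyInput() {
        Assert.assertEquals(CodingStandardsExamples.countHighQualityBases(new byte[0], 20), 0);
    }
}
```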

+

The GSA-Workflow

+

As of GATK 2.5, we are moving to a full code review process, which has the following benefits:

+ +

The GSA workflow in words :

+ +

Example GSA workflow in the command line:

+
# starting a new feature
+git checkout -b rp_pairhmm_GSA-332
+git commit -av 
+git push -u origin rp_pairhmm_GSA-332
+
+# doing work on existing feature
+git commit -av
+git push
+
+# ready to submit pull-request
+git fetch origin
+git rebase -i origin/master
+git push -f
+
+# after being accepted, delete your branch
+git checkout master 
+git pull
+git branch -d rp_pairhmm_GSA-332
+(the reviewer will remove your github branch)
+

Commit histories and rebasing

+

You must commit your code in small commit blocks with commit messages that follow the git best practices, which require the first line of the commit to summarize the purpose of the commit, followed by -- lines that describe the changes in more detail. For example, here's a recent commit that meets this criteria that added unit tests to the GenomeLocParser:

+
Refactoring and unit testing GenomeLocParser
+
+-- Moved previously inner class to MRUCachingSAMSequenceDictionary, and unit test to 100% coverage
+-- Fully document all functions in GenomeLocParser
+-- Unit tests for things like parsePosition (shocking it wasn't tested!)
+-- Removed function to specifically create GenomeLocs for VariantContexts.  The fact that you must incorporate END attributes in the context means that createGenomeLoc(Feature) works correctly
+-- Depreciated (and moved functionality) of setStart, setStop, and incPos to GenomeLoc
+-- Unit test coverage at like 80%, moving to 100% with next commit
+

Now, git encourages you to commit code often, and develop your code in whatever order or way is best for you. So it's common to end up with 20 commits, all with strange, brief commit messages, that you want to push into the master branch. It is not acceptable to push such changes. You need to use the git command rebase to reorganize your commit history to satisfy the goal of a small number of clear commits with clear messages.

+

Here is a recommended git workflow using rebase:

+
    +
  1. +

    Start every project by creating a new branch for it. From your master branch, type the following command (replacing "myBranch" with an appropriate name for the new branch):

    +
    git checkout -b myBranch
    +

    Note that you only include the -b when you're first creating the branch. After a branch is already created, you can switch to it by typing the checkout command without the -b: "git checkout myBranch"

    +

    Also note that since you're always starting a new branch from master, you should keep your master branch up-to-date by occasionally doing a "git pull" while your master branch is checked out. You shouldn't do any actual work on your master branch, however.

    +
  2. +
  3. +

    When you want to update your branch with the latest commits from the central repo, type this while your branch is checked out:

    +
    git fetch && git rebase origin/master
    +

    If there are conflicts while updating your branch, git will tell you what additional commands to use.

    +

    If you need to combine or reorder your commits, add "-i" to the above command, like so:

    +
    git fetch && git rebase -i origin/master
    +

    If you want to edit your commits without also retrieving any new commits, omit the "git fetch" from the above command.

    +
  4. +
+

If you find the above commands cumbersome or hard to remember, create aliases for them using the following commands:

+
    git config --global alias.up '!git fetch && git rebase origin/master'
+    git config --global alias.edit '!git fetch && git rebase -i origin/master'
+    git config --global alias.done '!git push origin HEAD:master'
+

Then you can type "git up" to update your branch, "git edit" to combine/reorder commits, and "git done" to push your branch.

+

Here are more useful tutorials on how to use rebase:

+ +

If you need help with rebasing, talk to Mauricio or David and they will help you out.

\ No newline at end of file diff --git a/doc_archive/developer-zone/How_to_access_the_picard_and_htsjdk_repository_(containing_samtools-jdk,_tribble,_and_variant).md b/doc_archive/developer-zone/How_to_access_the_picard_and_htsjdk_repository_(containing_samtools-jdk,_tribble,_and_variant).md new file mode 100644 index 000000000..c1bdcf84d --- /dev/null +++ b/doc_archive/developer-zone/How_to_access_the_picard_and_htsjdk_repository_(containing_samtools-jdk,_tribble,_and_variant).md @@ -0,0 +1,13 @@ +## How to access the picard and htsjdk repository (containing samtools-jdk, tribble, and variant) + +http://gatkforums.broadinstitute.org/gatk/discussion/2194/how-to-access-the-picard-and-htsjdk-repository-containing-samtools-jdk-tribble-and-variant + +

The picard repository on github contains all picard public tools. Libraries live under the htsjdk, which includes the samtools-jdk, tribble, and variant packages (which includes VariantContext and associated classes as well as the VCF/BCF codecs).

+

If you just need to check out the sources and don't need to make any commits into the picard repository, the command is:

+
git clone https://github.com/broadinstitute/picard.git
+

Then within the picard directory, clone the htsjdk.

+
cd picard
+git clone https://github.com/samtools/htsjdk.git
+

Then you can attach the picard/src/java and picard/htsjdk/src/java directories in IntelliJ as a source directory (File -> Project Structure -> Libraries -> Click the plus sign -> "Attach Files or Directories" in the latest IntelliJ).

+

To build picard and the htsjdk all at once, type ant from within the picard directory. To run tests, type ant test

+

If you do need to make commits into the picard repository, first you'll need to create a github account, fork picard or htsjdk, make your changes, and then issue a pull request. For more info on pull requests, see: https://help.github.com/articles/using-pull-requests

\ No newline at end of file diff --git a/doc_archive/developer-zone/How_to_include_GATK_in_a_Maven_project.md b/doc_archive/developer-zone/How_to_include_GATK_in_a_Maven_project.md new file mode 100644 index 000000000..d5c0e7794 --- /dev/null +++ b/doc_archive/developer-zone/How_to_include_GATK_in_a_Maven_project.md @@ -0,0 +1,41 @@ +## How to include GATK in a Maven project + +http://gatkforums.broadinstitute.org/gatk/discussion/6214/how-to-include-gatk-in-a-maven-project + +

GATK 3.x releases are not currently published to Central. But it is possible to install the GATK into your local repository, where Maven can then pick up the GATK as a dependency.

+
+

TL;DR Clone GATK 3.4, mvn install, then use the GATK as any other artifact.

+
+

The repository you should use depends on what is your goal.

+

If you want to build your own analysis tools on top of the GATK engine (not including the GATK analysis tools), with the option of distributing your project to others, you should clone the gatk repo.

+

If you want to integrate the full GATK into a project for in-house purposes (redistribution is not allowed under the licensing terms), in which your tools can call GATK tools directly internally, you should clone gatk-protected. This can be done by running the following code:

+
: 'GATK 3.4 code has known issues with the Java 8 compiler. Make sure you are using Java 7.'
+java -version
+
+: 'The entire GATK repo is relatively large. This only downloads 3.4.'
+git clone -b 3.4 --depth 1 git@github.com:broadgsa/gatk-protected.git gatk-protected-3.4
+cd gatk-protected-3.4
+
+: 'Install the gatk into a the local ~/.m2/repository, where your project can then refer to the GATK.'
+mvn install
+
+: 'Build the "external example" as a demo of using the GATK as a library.'
+cd public/external-example
+mvn verify
+java -jar target/external-example-1.0-SNAPSHOT.jar -T MyExampleWalker --help
+

After the GATK is installed, add this dependency to your Maven artifact, and all other GATK dependencies will be included as well.

+
<dependency>
+    <groupId>org.broadinstitute.gatk</groupId>
+    <artifactId>gatk-tools-protected</artifactId>
+    <version>3.4</version>
+</dependency>
+
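As a sketch of the in-house scenario where your own tools call GATK tools directly (my illustration, not part of the original article; it assumes the GATK 3.4 engine's org.broadinstitute.gatk.engine.CommandLineGATK entry point, the same class the packaged jar runs), a trivial driver can simply delegate to the GATK command line:

```
import org.broadinstitute.gatk.engine.CommandLineGATK;

public class InHouseGATKDriver {
    public static void main(final String[] args) throws Exception {
        // e.g. args = { "-T", "CountReads", "-R", "ref.fasta", "-I", "input.bam" }
        CommandLineGATK.main(args);
    }
}
```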

One thing you might run into is that the GATK artifacts, and hence the external-example, transitively depend on artifacts that are also not in Central. They are instead committed under the path public/repo. Like in the public/external-example/pom.xml, your Maven project may need to include this directory as an additional repository. That being said mvn install should copy the artifacts to ~/.m2/repository for you. For example, after the install, you should have a directory ~/.m2/repository/com/google/code/cofoja/cofoja.

+

If you somehow need to add the GATK's public repo as a repository, use a repository element like the one below:

+
<repositories>
+    <repository>
+        <id>gatk.public.repo.local</id>
+        <name>GATK Public Local Repository</name>
+        <url>file:/Users/someuser/src/gatk-protected-3.4/public/repo</url>
+    </repository>
+</repositories>
+

Since the GATK is not in Central, each developer will need to install the GATK 3.4 once. Or, as an advanced step, you may also want to explore publishing the GATK on one of your shared local systems. If you have a shared filesystem you'd like to use as a repository, publish the GATK 3.4 to the directory using mvn install -Dmaven.repo.local=/mount/path/to/shared/repo, and then add a repository element to your Maven project. If your team is using a Maven repository such as Artifactory or Nexus, we can't provide guidance for publishing "third party" artifacts. But it should theoretically be possible, with instructions hopefully available through either Maven or the repository manager's help forums.

\ No newline at end of file diff --git a/doc_archive/developer-zone/How_to_make_a_walker_compatible_with_multi-threading.md b/doc_archive/developer-zone/How_to_make_a_walker_compatible_with_multi-threading.md new file mode 100644 index 000000000..a30c5db7e --- /dev/null +++ b/doc_archive/developer-zone/How_to_make_a_walker_compatible_with_multi-threading.md @@ -0,0 +1,36 @@ +## How to make a walker compatible with multi-threading + +http://gatkforums.broadinstitute.org/gatk/discussion/2867/how-to-make-a-walker-compatible-with-multi-threading + +

This document provides an overview of the steps required to make a walker multi-threadable using the -nct and -nt arguments, which make use of the NanoSchedulable and TreeReducible interfaces, respectively.

+
+

NanoSchedulable / -nct

+

Providing -nct support requires that you certify that your walker's map() method is thread-safe -- eg., if any data structures are shared across map() calls, access to these must be properly synchronized. Once your map() method is thread-safe, you can implement the NanoSchedulable interface, an empty interface with no methods that just marks your walker as having a map() method that's safe to parallelize:

+
/**
+ * Root parallelism interface.  Walkers that implement this
+ * declare that their map function is thread-safe and so multiple
+ * map calls can be run in parallel in the same JVM instance.
+ */
+public interface NanoSchedulable {
+}
+
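For illustration, here is a minimal sketch (not taken from the GATK sources, and assuming the standard ReadWalker map signature) of a walker opting in to -nct; its map() touches no shared mutable state, so it is trivially thread-safe:

```
public class CountBasesWalker extends ReadWalker<Integer, Long> implements NanoSchedulable {
    @Override
    public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) {
        return read.getReadLength();   // no shared state is modified here
    }

    @Override
    public Long reduceInit() { return 0L; }

    @Override
    public Long reduce(Integer value, Long sum) { return sum + value; }
}
```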
+

TreeReducible / -nt

+

Providing -nt support requires that both map() and reduce() be thread-safe, and you also need to implement the TreeReducible interface. Implementing TreeReducible requires you to write a treeReduce() method that tells the engine how to combine the results of multiple reduce() calls:

+
public interface TreeReducible<ReduceType> {
+    /**
+     * A composite, 'reduce of reduces' function.
+     * @param lhs 'left-most' portion of data in the composite reduce.
+     * @param rhs 'right-most' portion of data in the composite reduce.
+     * @return The composite reduce type.
+     */
+    ReduceType treeReduce(ReduceType lhs, ReduceType rhs);
+}
+

This method differs from reduce() in that while reduce() adds the result of a single map() call onto a running total, treeReduce() takes the aggregated results from multiple map/reduce tasks that have been run in parallel and combines them. So, lhs and rhs might each represent the final result from several hundred map/reduce calls.

+

Example treeReduce() implementation from the UnifiedGenotyper:

+
public UGStatistics treeReduce(UGStatistics lhs, UGStatistics rhs) {
+    lhs.nBasesCallable += rhs.nBasesCallable;
+    lhs.nBasesCalledConfidently += rhs.nBasesCalledConfidently;
+    lhs.nBasesVisited += rhs.nBasesVisited;
+    lhs.nCallsMade += rhs.nCallsMade;
+    return lhs;
+}
\ No newline at end of file diff --git a/doc_archive/developer-zone/Managing_user_inputs.md b/doc_archive/developer-zone/Managing_user_inputs.md new file mode 100644 index 000000000..767621088 --- /dev/null +++ b/doc_archive/developer-zone/Managing_user_inputs.md @@ -0,0 +1,289 @@ +## Managing user inputs + +http://gatkforums.broadinstitute.org/gatk/discussion/1325/managing-user-inputs + +

1. Naming walkers

+

Users identify which GATK walker to run by specifying a walker name via the --analysis_type command-line argument. By default, the GATK will derive the walker name from a walker by taking the name of the walker class and removing packaging information from the start of the name, and removing the trailing text Walker from the end of the name, if it exists. For example, the GATK would, by default, assign the name PrintReads to the walker class org.broadinstitute.sting.gatk.walkers.PrintReadsWalker. To override the default walker name, annotate your walker class with @WalkerName("<my name>").
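For example, the hypothetical walker below (invented here purely to illustrate the annotation) would otherwise be named MyReadCounter, but the annotation renames it so it is invoked with --analysis_type CountMyReads (or -T CountMyReads):

```
@WalkerName("CountMyReads")
public class MyReadCounterWalker extends ReadWalker<Integer, Long> {
    public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return 1; }
    public Long reduceInit() { return 0L; }
    public Long reduce(Integer value, Long sum) { return sum + value; }
}
```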

+

2. Requiring / allowing primary inputs

+

Walkers can flag exactly which primary data sources are allowed and required for a given walker. Reads, the reference, and reference-ordered data are currently considered primary data sources. Different traversal types have different default requirements for reads and reference, but currently no traversal types require reference-ordered data by default. You can add requirements to your walker with the @Requires / @Allows annotations as follows:

+
@Requires(DataSource.READS)
+@Requires({DataSource.READS,DataSource.REFERENCE})
+@Requires(value={DataSource.READS,DataSource.REFERENCE})
+@Requires(value=DataSource.REFERENCE})
+

By default, all parameters are allowed unless you lock them down with the @Allows attribute. The command:

+
@Allows(value={DataSource.READS,DataSource.REFERENCE})
+

will only allow the reads and the reference. Any other primary data sources will cause the system to exit with an error.

+

Note that as of August 2011, the GATK no longer supports reference-ordered data (RMD) via the @Requires and @Allows syntax, as these inputs have moved to the standard @Argument system.

+

3. Command-line argument tagging

+

Any command-line argument can be tagged with a comma-separated list of freeform tags.

+

The syntax for tags is as follows:

+
-<argument>:<tag1>,<tag2>,<tag3> <argument value>
+

for example:

+
-I:tumor <my tumor data>.bam
+-eval,VCF yri.trio.chr1.vcf
+

There is currently no mechanism in the GATK to validate either the number of tags supplied or the content of those tags.

+

Tags can be accessed from within a walker by calling getToolkit().getTags(argumentValue), where argumentValue is the parsed contents of the command-line argument to inspect.

+

Applications

+

The GATK currently has comprehensive support for tags on two built-in argument types:

+ +

From within a walker, use the following code to access the supplied tag or tags:

+
getToolkit().getReaderIDForRead(read).getTags();
+ +

4. Adding additional command-line arguments

+

Users can create command-line arguments for walkers by creating public member variables annotated with @Argument in the walker. The @Argument annotation takes a number of different parameters:

+ +

By default, all command-line arguments will appear in the help system. To prevent new and debugging arguments from appearing in the help system, you can add the @Hidden tag below the @Argument annotation, hiding it from the help system but allowing users to supply it on the command-line. Please use this functionality sparingly to avoid walkers with hidden command-line options that are required for production use.

+

Passing Command-Line Arguments

+

Arguments can be passed to the walker using either the full name or the short name. If passing arguments using the full name, the syntax is --<arg full name> <value>.

+
--myint 6
+

If passing arguments using the short name, the syntax is -<arg short name> <value>. Note that there is a space between the short name and the value:

+
-m 6
+

Boolean (class) and boolean (primitive) arguments are special in that they require no value. The presence of the flag indicates true, and its absence indicates false. The following example sets a flag to true.

+
-B
+

Supplemental command-line argument annotations

+

Two additional annotations can influence the behavior of command-line arguments.

+ +

Examples

+

Create a required int parameter with full name --myint, short name -m. Pass this argument by adding --myint 6 or -m 6 to the command line.

+
import org.broadinstitute.sting.utils.cmdLine.Argument;
+public class HelloWalker extends ReadWalker<Integer,Long> {
+    @Argument(doc="my integer")
+    public int myInt;
+

Create an optional float parameter with full name --myFloatingPointArgument, short name -m. Pass this argument by adding --myFloatingPointArgument 2.71 or -m 2.71.

+
import org.broadinstitute.sting.utils.cmdLine.Argument;
+public class HelloWalker extends ReadWalker<Integer,Long> {
+    @Argument(fullName="myFloatingPointArgument",doc="a floating point argument",required=false)
+    public float myFloat;
+

The GATK will parse the argument differently depending on the type of the public member variable. Many different argument types are supported, including primitives and their wrappers, arrays, typed and untyped collections, and any type with a String constructor. When the GATK cannot completely infer the type (such as in the case of untyped collections), it will assume that the argument is a String. GATK is aware of concrete implementations of some interfaces and abstract classes. If the argument's member variable is of type List or Set, the GATK will fill the member variable with a concrete ArrayList or TreeSet, respectively. Maps are not currently supported.
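For instance, a collection-typed argument can be declared as in the sketch below (the argument name is invented for this example); the engine fills the List with a concrete ArrayList, adding one element for each occurrence of the argument on the command line, so -sn NA12878 -sn NA12891 would yield a two-element list:

```
import java.util.List;
import org.broadinstitute.sting.utils.cmdLine.Argument;
public class HelloWalker extends ReadWalker<Integer,Long> {
    @Argument(fullName="sampleName", shortName="sn", doc="names of samples to keep", required=false)
    public List<String> sampleNames;
```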

+

5. Additional argument types: @Input, @Output

+

Besides @Argument, the GATK provides two additional annotations for command-line arguments: @Input and @Output. These two annotations are very similar to @Argument but act as flags to indicate dataflow to Queue, our pipeline management software.

+ +

We're still determining the best way to model walker dependencies in our pipeline. As we determine best practices, we'll post them here.

+

6. Getting access to Reference Ordered Data (RMD) with @Input and RodBinding

+

As of August 2011, the GATK now provides a clean mechanism for creating walker @Input arguments and using these arguments to access Reference Meta Data provided by the RefMetaDataTracker in the map() call. This mechanism is preferred to the old implicit string-based mechanism, which has been retired.

+

At a very high level, the new RodBindings provide a handle for a walker to obtain the Feature records from Tribble from a map() call, specific to a command line binding provided by the user. This can be as simple as a single ROD file argument (a one-to-one binding between a command line argument and a track), or as complex as an argument accepting multiple command line arguments, each with a specific name. The RodBindings are generic and type specific, so you can require users to provide files that emit VariantContexts, BedTables, etc., or simply the root Tribble type Feature. Critically, the RodBindings interact nicely with the GATKDocs system, so you can provide summary and detailed documentation for each RodBinding accepted by your walker.

+

A single ROD file argument

+

Suppose you have a walker that uses a single track of VariantContexts, such as SelectVariants, in its calculation. You declare a standard GATK-style @Input argument in the walker, of type RodBinding<VariantContext>:

+
@Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true)
+public RodBinding<VariantContext> variants;
+

This will require the user to provide a command line option --variant:vcf my.vcf to your walker. To get access to your variants, in the map() function you provide the variants variable to the tracker, as in:

+
Collection<VariantContext> vcs = tracker.getValues(variants, context.getLocation());
+

which returns all of the VariantContexts in variants that start at context.getLocation(). See RefMetaDataTracker in the javadocs to see the full range of getter routines.

+

Note that, as with regular tribble tracks, you have to provide the Tribble type of the file as a tag to the argument (:vcf). The system now checks up front that the corresponding Tribble codec produces Features that are type-compatible with the type of the RodBinding<T>.

+

RodBindings are generic

+

The RodBinding class is generic, parameterized as RodBinding<T extends Feature>. This T class describes the type of the Feature required by the walker. The best practice for declaring a RodBinding is to choose the most general Feature type that will allow your walker to work. For example, if all you really care about is whether a Feature overlaps the site in map, you can use Feature itself, which supports this, and will allow any Tribble type to be provided, using a RodBinding<Feature>. If you are manipulating VariantContexts, you should declare a RodBinding<VariantContext>, which will automatically restrict the user to providing Tribble types that can create an object consistent with the VariantContext class (a VariantContext itself or a subclass).

+

Note that in multi-argument RodBindings, such as List<RodBinding<T>> arg, the system will require all files provided here to provide an object of type T. So List<RodBinding<VariantContext>> arg requires all -arg command line arguments to bind to files that produce VariantContexts.

+

An argument that can be provided any number of times

+

The RodBinding system supports the standard @Argument style of allowing a vararg argument by wrapping it in a Java collection. For example, if you want to allow users to provide any number of comp tracks to your walker, simply declare a List<RodBinding<VariantContext>> field:

+
@Input(fullName="comp", shortName = "comp", doc="Comparison variants from this VCF file", required=true)
+public List<RodBinding<VariantContext>> comps;
+

With this declaration, your walker will accept any number of -comp arguments, as in:

+
-comp:vcf 1.vcf -comp:vcf 2.vcf -comp:vcf 3.vcf
+

For such a command line, the comps field would be initialized to the List with three RodBindings, the first binding to 1.vcf, the second to 2.vcf and finally the third to 3.vcf.

+

Because this is a required argument, at least one -comp must be provided. Vararg @Input RodBindings can be optional, but you should follow proper varargs style to get the best results.

+

Proper handling of optional arguments

+

If you want to make a RodBinding optional, you first need to tell the @Input argument that it is optional (required=false):

+
@Input(fullName="discordance", required=false)
+private RodBinding<VariantContext> discordanceTrack;
+

The GATK automagically sets this field to the value of the special static constructor method makeUnbound(Class c) to create a special "unbound" RodBinding here. This unbound object is type safe, can be safely passed to the RefMetaDataTracker get methods, and is guaranteed to never return any values. It also returns false when the isBound() method is called.

+

An example usage of isBound is to conditionally add header lines, as in:

+
if ( mask.isBound() ) {
+    hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask"));
+}
+

The case for vararg style RodBindings is slightly different. If you want, as above, users to be able to omit the -comp track entirely, you should initialize the value of the collection to the appropriate emptyList/emptySet in Collections:

+
@Input(fullName="comp", shortName = "comp", doc="Comparison variants from this VCF file", required=false)
+public List<RodBinding<VariantContext>> comps = Collections.emptyList();
+

which will ensure that comps.isEmpty() is true when no -comp is provided.

+

Implicit and explicit names for RodBindings

+
@Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true)
+public RodBinding<VariantContext> variants;
+

By default, the getName() method in RodBinding returns the fullName of the @Input. This can be overridden on the command-line by providing not one but two tags. The first tag is interpreted as the name for the binding, and the second as the type. As in:

+
-variant:vcf foo.vcf     => getName() == "variant"
+-variant:foo,vcf foo.vcf => getName() == "foo"
+

This capability is useful when users need to provide more meaningful names for arguments, especially with variable arguments. For example, in VariantEval, there's a List<RodBinding<VariantContext>> comps, which may be dbsnp, hapmap, etc. This would be declared as:

+
@Input(fullName="comp", shortName = "comp", doc="Comparison variants from this VCF file", required=true)
+public List<RodBinding<VariantContext>> comps;
+

where a normal command line usage would look like:

+
-comp:hapmap,vcf hapmap.vcf -comp:omni,vcf omni.vcf -comp:1000g,vcf 1000g.vcf
+

In the code, you might have a loop that looks like:

+
for ( final RodBinding comp : comps )
+    for ( final VariantContext vc : tracker.getValues(comp, context.getLocation()) )
+        out.printf("%s has a binding at %s%n", comp.getName(), getToolkit().getGenomeLocParser().createGenomeLoc(vc));
+

which would print out lines that included things like:

+
hapmap has a binding at 1:10
+omni has a binding at 1:20
+hapmap has a binding at 1:30
+1000g has a binding at 1:30
+

This last example begs the question -- what happens with getName() when explicit names are not provided? The system goes out of its way to provide reasonable names for the variables:

+ +

In the above example, the command line

+
-comp:vcf hapmap.vcf -comp:vcf omni.vcf -comp:vcf 1000g.vcf
+

would emit

+
comp has a binding at 1:10
+comp2 has a binding at 1:20
+comp has a binding at 1:30
+comp3 has a binding at 1:30
+

Dynamic type resolution

+

The new RodBinding system supports a simple form of dynamic type resolution. If the input file type can be specially associated with a single Tribble type (as VCF can), then you can omit the type entirely from the command-line binding of a RodBinding!

+

So whereas a full command line would look like:

+
-comp:hapmap,vcf hapmap.vcf -comp:omni,vcf omni.vcf -comp:1000g,vcf 1000g.vcf
+

because these are VCF files they could technically be provided as:

+
-comp:hapmap hapmap.vcf -comp:omni omni.vcf -comp:1000g 1000g.vcf
+

If you don't care about naming, you can now say:

+
-comp hapmap.vcf -comp omni.vcf -comp 1000g.vcf
+

Best practice for documenting a RodBinding

+

The best practice is simple: use a javadoc style comment above the @Input annotation, with the standard first line summary and subsequent detailed discussion of the meaning of the argument. These are then picked up by the GATKdocs system and added to the standard walker docs, following the standard structure of GATKDocs @Argument docs. Below is a best practice documentation example from SelectVariants, which accepts a required variant track and two optional discordance and concordance tracks.

+
public class SelectVariants extends RodWalker<Integer, Integer> {
+   /**
+     * Variants from this file are sent through the filtering and modifying routines as directed
+     * by the arguments to SelectVariants, and finally are emitted.
+     */
+    @Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true)
+    public RodBinding<VariantContext> variants;
+
+    /**
+     * A site is considered discordant if there exists some sample in eval that has a non-reference genotype
+     * and either the site isn't present in this track, the sample isn't present in this track,
+     * or the sample is called reference in this track.
+     */
+    @Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this Feature comparison track", required=false)
+    private RodBinding<VariantContext> discordanceTrack;
+
+    /**
+     * A site is considered concordant if (1) we are not looking for specific samples and there is a variant called
+     * in both variants and concordance tracks or (2) every sample present in eval is present in the concordance
+     * track and they have the sample genotype call.
+     */
+    @Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this Feature comparison track", required=false)
+    private RodBinding<VariantContext> concordanceTrack;
+}
+

Note how much better the above version is compared to the old pre-RodBinding syntax (code below). Below you have a required argument variant that doesn't show up as a formal argument in the GATK, different from the conceptually similar @Arguments for discordanceRodName and concordanceRodName, which have no type restrictions. There's also no place to document the variant argument, so the system is effectively blind to this essential argument.

+
@Requires(value={},referenceMetaData=@RMD(name="variant", type=VariantContext.class))
+public class SelectVariants extends RodWalker<Integer, Integer> {
+    @Argument(fullName="discordance", shortName =  "disc", doc="Output variants that were not called on a ROD comparison track. Use -disc ROD_NAME", required=false)
+    private String discordanceRodName = "";
+
+    @Argument(fullName="concordance", shortName =  "conc", doc="Output variants that were also called on a ROD comparison track. Use -conc ROD_NAME", required=false)
+    private String concordanceRodName = "";
+}
+

RodBinding examples

+

In these examples, we have declared two RodBindings in the Walker

+
@Input(fullName="mask", doc="Input ROD mask", required=false)
+public RodBinding<Feature> mask = RodBinding.makeUnbound(Feature.class);
+
+@Input(fullName="comp", doc="Comparison track", required=false)
+public List<RodBinding<VariantContext>> comps = new ArrayList<RodBinding<VariantContext>>();
+ +

Example usage in Queue scripts

+

In QScripts, when you need to tag a file, use the class TaggedFile, which extends java.io.File.

| Example | in the QScript | on the Command Line |
| --- | --- | --- |
| Untagged VCF | myWalker.variant = new File("my.vcf") | -V my.vcf |
| Tagged VCF | myWalker.variant = new TaggedFile("my.vcf", "VCF") | -V:VCF my.vcf |
| Tagged VCF | myWalker.variant = new TaggedFile("my.vcf", "VCF,custom=value") | -V:VCF,custom=value my.vcf |
| Labeling a tumor | myWalker.input_file :+= new TaggedFile("mytumor.bam", "tumor") | -I:tumor mytumor.bam |
+

Notes

+

You no longer need to (nor can you) use @Requires and @Allows for ROD data. This system is now retired.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Managing_walker_data_presentation_and_flow_control.md b/doc_archive/developer-zone/Managing_walker_data_presentation_and_flow_control.md new file mode 100644 index 000000000..0abaada67 --- /dev/null +++ b/doc_archive/developer-zone/Managing_walker_data_presentation_and_flow_control.md @@ -0,0 +1,102 @@ +## Managing walker data presentation and flow control + +http://gatkforums.broadinstitute.org/gatk/discussion/1351/managing-walker-data-presentation-and-flow-control + +

The primary goal of the GATK is to provide a suite of small data access patterns that can easily be parallelized and otherwise externally managed. As such, rather than asking walker authors how to iterate over a data stream, the GATK asks the user how data should be presented.

+

Locus walkers

+

Walk over the data set one location (single-base locus) at a time, presenting all overlapping reads, reference bases, and reference-ordered data.
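A minimal locus walker skeleton looks like the sketch below (names invented here, assuming the standard LocusWalker map signature also used in the VariantEval example later on this page); the engine calls map() once per presented locus and folds the per-locus results together with reduce():

```
public class CountCoveredLociWalker extends LocusWalker<Integer, Long> {
    @Override
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        return 1;    // one presented locus
    }

    @Override
    public Long reduceInit() { return 0L; }

    @Override
    public Long reduce(Integer value, Long sum) { return sum + value; }
}
```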

+

1. Switching between covered and uncovered loci

+

The @By attribute can be used to control whether locus walkers see all loci or just covered loci. To switch between viewing all loci and covered loci, apply one of the following attributes:

+
@By(DataSource.REFERENCE)
+@By(DataSource.READS)
+

2. Filtering defaults

+

By default, the following filters are automatically added to every locus walker.

+ +

ROD walkers

+

These walkers walk over the data set one location at a time, but only those locations covered by reference-ordered data. They are essentially a special case of locus walkers. ROD walkers are read-free traversals that operate over Reference Ordered Data and the reference genome at sites where there is ROD information. They are geared for high-performance traversal of many RODs and the reference, such as VariantEval and CallSetConcordance. Programmatically they are nearly identical to RefWalker<M,T> traversals with the following few quirks.

+

1. Differences from a RefWalker

+ +

In order to get the final count of skipped bases at the end of an interval (or chromosome) the map function is called one last time with null ReferenceContext and RefMetaDataTracker objects. The alignment context can be accessed to get the bases skipped between the last (and final) ROD and the end of the current interval.

+

2. Filtering defaults

+

ROD walkers inherit the same filters as locus walkers:

+ +

3. Example change over of VariantEval

+

Changing to a RODWalker is very easy -- here's the new top of VariantEval, changing the system to a RodWalker from its old RefWalker state:

+
//public class VariantEvalWalker extends RefWalker<Integer, Integer> {
+public class VariantEvalWalker extends RodWalker<Integer, Integer> {
+

The map function must now capture the number of skipped bases and protect itself from the final interval map calls:

+
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+    nMappedSites += context.getSkippedBases();
+
+    if ( ref == null ) { // we are seeing the last site
+        return 0;
+    }
+
+    nMappedSites++;
+

That's all there is to it!

+

4. Performance improvements

+

A ROD walker can be very efficient compared to a RefWalker in the situation where you have sparse RODs. Here is a comparison of ROD vs. Ref walker implementation of VariantEval:

| | RODWalker | RefWalker |
| --- | --- | --- |
| dbSNP and 1KG Pilot 2 SNP calls on chr1 | 164u (s) | 768u (s) |
| Just 1KG Pilot 2 SNP calls on chr1 | 54u (s) | 666u (s) |
+

Read walkers

+

Read walkers walk over the data set one read at a time, presenting all overlapping reference bases and reference-ordered data.

+

Filtering defaults

+

By default, the following filters are automatically added to every read walker.

+ +

Read pair walkers

+

Read pair walkers walk over a queryname-sorted BAM, presenting each mate and its pair. No reference bases or reference-ordered data are presented.

+

Filtering defaults

+

By default, the following filters are automatically added to every read pair walker.

+ +

Duplicate walkers

+

Duplicate walkers walk over a read and all its marked duplicates. No reference bases or reference-ordered data are presented.

+

Filtering defaults

+

By default, the following filters are automatically added to every duplicate walker.

+ \ No newline at end of file diff --git a/doc_archive/developer-zone/Migration_from_Apache_Ant_to_Apache_Maven.md b/doc_archive/developer-zone/Migration_from_Apache_Ant_to_Apache_Maven.md new file mode 100644 index 000000000..4c56bb44d --- /dev/null +++ b/doc_archive/developer-zone/Migration_from_Apache_Ant_to_Apache_Maven.md @@ -0,0 +1,174 @@ +## Migration from Apache Ant to Apache Maven + +http://gatkforums.broadinstitute.org/gatk/discussion/3437/migration-from-apache-ant-to-apache-maven + +

Overview

+
+

We're replacing Ant with Maven. To build, run mvn verify.

+

Background

+

In the early days of the Genome Analysis Toolkit (GATK), the code base separated the GATK genomics engine from the core java utilities, encompassed in a wider project called Sting. During this time, the build tool of choice was the relatively flexible Java build tool Apache Ant, run via the command ant.

+

As our code base expanded to more and more packages, groups internal and external to GSA, and the Broad, have expressed interest in using portions of Sting/GATK as modules in larger projects. Unfortunately, over time many parts of the GATK and Sting intermingled, producing the current situation where developers find it easier to copy the monolithic GATK, or individual java files, instead of using the tools as libraries.

+

The goal of this first stage is to split the parts of the monolithic Sting/GATK into easily recognizable sub artifacts. The tool used to accomplish this task is Apache Maven, also known as Maven, and run via the command mvn. Maven convention encourages developers to separate code, and accompanying resources, into a hierarchical structure of reusable artifacts. Maven attempts to minimize build configuration, preferring source repositories to lay out code in a conventional structure. When needed, a Maven configuration file called pom.xml specifies each artifact's build configuration, which one may think of as similar to an Ant build.xml.

+

The actual migration consisted of zero changes to the contents of existing Java source files, easing git merges and rebasing. The Java files from public, protected, and private have all moved into Maven conventional child artifacts, with each artifact containing a separate pom.xml.

+

Examples

+

Obtaining the GATK with Maven support

+

Clone the repository:

+

git clone ssh://git@github.com/broadinstitute/gsa-unstable.git
cd gsa-unstable

+

Building GATK and Queue

+

Clone the repository:

+

git clone ssh://git@github.com/broadinstitute/gsa-unstable.git
cd gsa-unstable

+

If running on a Broad server, add maven to your environment via the dotkit:

+

reuse Maven-3.0.3

+

Build all of Sting, including packaged versions of the GATK and Queue:

+

mvn verify

+

The packaged, executable jar files will be output to:

+

public/gatk-package/target/gatk-package-2.8-SNAPSHOT.jar
public/queue-package/target/queue-package-2.8-SNAPSHOT.jar

+

Find equivalent maven commands for existing ant targets:

+

./ant-bridge.sh <target> <properties>

+

Example output:

+

$ ./ant-bridge.sh fasttest -Dsingle=GATKKeyUnitTest
Equivalent maven command
mvn verify -Dsting.committests.skipped=false -pl private/gatk-private -am -Dresource.bundle.skip=true -Dit.test=disabled -Dtest=GATKKeyUnitTest
$

+

Running the GATK and Queue

+

To run the GATK, or copy the compiled jar, find the packaged jar under public/gatk-package/target

+

public/gatk-package/target/gatk-package-2.8-SNAPSHOT.jar

+

To run Queue, the jar is under the similarly named public/queue-package/target

+

public/queue-package/target/queue-package-2.8-SNAPSHOT.jar

+

NOTE: Unlike builds with Ant, you cannot execute the jar file built by the gatk-framework module. This is because Maven does not include dependent artifacts in the target folder with the assembled framework jar. Instead, use the packaged jars, listed above, that contain all the classes and resources needed to run the GATK, or Queue.

+

Excluding Queue

+

NOTE: If you make changes to sting-utils, gatk-framework, or any other dependencies and disable queue, you may accidentally end up breaking the full repository build without knowing.

+

The Queue build contributes a majority portion of the Sting project build time. To exclude Queue from your build, run maven with either (the already shell escaped) -P\!queue or -Ddisable.queue. Currently the latter property also disables the maven queue profile. This allows one other semi-permanent option to disable building Queue as part of the Sting repository. Configure your local Maven settings to always pass the property -Ddisable.queue by adding and activating a custom profile in your local ~/.m2/settings.xml

+

```
$ cat ~/.m2/settings.xml
<settings>
    <profiles>
        <profile>
            <id>disable.queue</id>
            <properties>
                <disable.queue>true</disable.queue>
            </properties>
        </profile>
    </profiles>
    <activeProfiles>
        <activeProfile>disable.queue</activeProfile>
    </activeProfiles>
</settings>
$
```

+

Using the GATK framework as a module

+

Currently the GATK artifacts are not available via any centralized repository. To build code using the GATK you must still have a checkout of the GATK source code, and install the artifacts to your local mvn repository (by default ~/.m2/repository). The installation copies the artifacts to your local repo so that they may be used by your external project. The source checkout also provides several artifacts under public/repo that will be required for your project.

+

After updating to the latest version of the Sting source code, install the Sting artifacts via:

+

mvn install

+

After the GATK has been installed locally, in your own source repository, include the artifact gatk-framework as a library.

+

In Apache Maven add this dependency:

+

```
<dependency>
    <groupId>org.broadinstitute.sting</groupId>
    <artifactId>gatk-framework</artifactId>
    <version>2.8-SNAPSHOT</version>
</dependency>
```

+

For Apache Ivy, you may need to specify ~/.m2/repository as a local repo. Once the local repository has been configured, ivy may find the dependency via:

+

<dependency org="org.broadinstitute.sting" name="gatk-framework" rev="2.8-SNAPSHOT" />

+

If you decide to also use Maven to build your project, your source code should go under the conventional directory src/main/java. The pom.xml contains any special configuration for your project. An example pom.xml and conventional Maven project structure can be found in:

+

public/external-example

+

Moved directories

+

If you have an old git branch that needs to be merged, you may need to know where to move files in order for your classes to now build with Maven. In general, most directories were moved with minimal or no changes.

| Old directory | New maven directory |
| --- | --- |
| private/java/src/ | private/gatk-private/src/main/java/ |
| private/R/scripts/ | private/gatk-private/src/main/resources/ |
| private/java/test/ | private/gatk-private/src/test/java/ |
| private/testdata/ | private/gatk-private/src/test/resources/ |
| private/scala/qscript/ | private/queue-private/src/main/qscripts/ |
| private/scala/src/ | private/queue-private/src/main/scala/ |
| private/scala/test/ | private/queue-private/src/test/scala/ |
| protected/java/src/ | protected/gatk-protected/src/main/java/ |
| protected/java/test/ | protected/gatk-protected/src/test/java/ |
| public/java/src/ | public/gatk-framework/src/main/java/ |
| public/java/test/ | public/gatk-framework/src/test/java/ |
| public/testdata/ | public/gatk-framework/src/test/resources/ |
| public/scala/qscript/ | public/queue-framework/src/main/qscripts/ |
| public/scala/src/ | public/queue-framework/src/main/scala/ |
| public/scala/test/ | public/queue-framework/src/test/scala/ |
+

Future Directions

+

Further segregate source code

+

Currently, the artifacts sting-utils and the gatk-framework contain intertwined code bases. This leads to the current setup where all sting-utils code is actually found in the gatk-framework artifact, including generic utilities that could be used by other software modules. In the future, all elements under org.broadinstitute.sting.gatk will be located in the gatk-framework, while all other packages under org.broadinstitute.sting will be evaluated and then separated under the gatk-framework or sting-utils artifacts.

+

Publishing artifacts

+

Tangentially related to segregating sting-utils and the gatk-framework, the current Sting and GATK artifacts are ineligible to be pushed to the Maven Central Repository, due to several other issues:

+ +

NOTE: Artifact jars do NOT need to actually be in Central, and may be available as pom reference only, for example Oracle ojdbc.

+

In the near term, we could use a private repo based on Artifactory or Nexus (comparison). After more work of adding, cleaning up, or centrally publishing all the dependencies for Sting, we may then publish into the basic Central repo. Or, we could move to a social service like BinTray (think GitHub vs. Git).

+

Status Updates

+

February 13, 2014

+

Maven is now the default in gsa-unstable's master branch. For GATK developers, the git migration is effectively complete. Software engineers are resolving a few remaining issues related to the automated build and testing infrastructure, but the basic workflow for developers should now be up to date.

+

January 30, 2014

+

The migration to Maven has begun in the gsa-unstable repository on the ks_new_maven_build_system branch.

+

November 5, 2013

+

The maven port of the existing ant build resides in the gsa-qc repository.

+

This is an old branch of Sting/GATK, with the existing files relocated to Maven appropriate locations, pom.xml files added, along with basic resources to assist in artifact generation.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Notes_on_downsampling_in_HC_M2.md b/doc_archive/developer-zone/Notes_on_downsampling_in_HC_M2.md new file mode 100644 index 000000000..fbae8d505 --- /dev/null +++ b/doc_archive/developer-zone/Notes_on_downsampling_in_HC_M2.md @@ -0,0 +1,40 @@ +## Notes on downsampling in HC/M2 + +http://gatkforums.broadinstitute.org/gatk/discussion/8028/notes-on-downsampling-in-hc-m2 + +

This document aims to record some developer notes for posterity. Contents were generated July 24, 2015 and are not guaranteed to be up to date. No support guarantee either.

+
+

Arguments and Parameters

+ +

Relevant Code

+ +

Worst Case M2 Behavior

+ \ No newline at end of file diff --git a/doc_archive/developer-zone/Output_management.md b/doc_archive/developer-zone/Output_management.md new file mode 100644 index 000000000..5991f7df5 --- /dev/null +++ b/doc_archive/developer-zone/Output_management.md @@ -0,0 +1,113 @@ +## Output management + +http://gatkforums.broadinstitute.org/gatk/discussion/1327/output-management + +

1. Introduction

+

When running either single-threaded or in shared-memory parallelism mode, the GATK guarantees that output written to an output stream created via the @Argument mechanism will ultimately be assembled in genomic order. In order to assemble the final output file, the GATK will write the output generated from each thread into a temporary output file, ultimately assembling the data via a central coordinating thread. There are three major elements in the GATK that facilitate this functionality:

+ +

2. Basic Mechanism

+

Stubs are directly injected into the walker through the GATK's command-line argument parser as a go-between from walker to output management system. When a walker calls into the stub, its first responsibility is to call into the output tracker to retrieve an appropriate storage object. The behavior of the OutputTracker from this point forward depends mainly on the parallelization mode of this traversal of the GATK.

+

If the traversal is single-threaded:

+ +

If the traversal is multi-threaded using shared-memory parallelism:

+ +

3. Using output management

+

To use the output management system, declare a field in your walker of one of the existing core output types, coupled with either an @Argument or @Output annotation.

+
@Output(doc="Write output to this BAM filename instead of STDOUT")
+SAMFileWriter out;
+

Currently supported output types are SAM/BAM (declare SAMFileWriter), VCF (declare VCFWriter), and any non-buffering stream extending from OutputStream.
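For example, a walker that emits variants could declare a VCF output in the same way. The following is only a sketch: the walker base class, the VCFWriter field, and the import path shown are assumptions about a pre-renaming GATK source tree, so check your checkout for the exact names.

    // Hedged sketch: declaring a VCF output handled by the output management system.
    // The import path below is an assumption; verify it against your GATK version.
    import org.broadinstitute.sting.commandline.Output;

    public class MyVariantEmittingWalker extends RodWalker<Integer, Integer> {

        @Output(doc="Write variants to this VCF file instead of STDOUT")
        protected VCFWriter vcfWriter;  // the engine injects a stub; records are assembled in genomic order

        // map(), reduceInit(), reduce() omitted for brevity
    }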

+

4. Implementing a new output type

+

To create a new output type, three types must be implemented: Stub, Storage, and ArgumentTypeDescriptor.

+

To implement Stub

+

Create a new Stub class, extending/inheriting the core output type's interface and implementing the Stub interface.

+
public class OutputStreamStub extends OutputStream implements Stub<OutputStream> {
+

Implement a register function so that the engine can provide the stub with the session's OutputTracker.

+
public void register( OutputTracker outputTracker ) {
+    this.outputTracker = outputTracker;
+}
+

Add as fields any parameters necessary for the storage object to create temporary storage.

+
private final File targetFile;
+public File getOutputFile() { return targetFile; }
+

Implement/override every method in the core output type's interface to pass along calls to the appropriate storage object via the OutputTracker.

+
public void write( byte[] b, int off, int len ) throws IOException {
+    outputTracker.getStorage(this).write(b, off, len);
+}
+

To implement Storage

+

Create a Storage class, again extending/inheriting the core output type's interface and implementing the Storage interface.

+
public class OutputStreamStorage extends OutputStream implements Storage<OutputStream> {
+

Implement constructors that will accept just the Stub or Stub + alternate file path and create a repository for data, and a close function that will close that repository.

+
public OutputStreamStorage( OutputStreamStub stub ) { ... }
+public OutputStreamStorage( OutputStreamStub stub, File file ) { ... }
+public void close() { ... }
+

Implement a mergeInto function capable of reconstituting the file created by the constructor, dumping it back into the core output type's interface, and removing the source file.

+
public void mergeInto( OutputStream targetStream ) { ... }
+
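A possible body for that signature is sketched below, assuming the constructor wrote to a temporary file held in a field named temporaryFile; the field and error-handling style are invented here for illustration, not taken from the GATK source.

    // Hedged sketch of mergeInto(): stream the temporary file back into the target
    // stream, then delete it. Requires java.io.* imports in the enclosing class.
    public void mergeInto( OutputStream targetStream ) {
        try {
            InputStream in = new FileInputStream(temporaryFile);
            byte[] buffer = new byte[8192];
            int read;
            while ( (read = in.read(buffer)) != -1 )
                targetStream.write(buffer, 0, read);   // reconstitute the data into the final output
            in.close();
            temporaryFile.delete();                    // remove the per-thread source file
        } catch ( IOException e ) {
            throw new RuntimeException("Unable to merge temporary storage into final output", e);
        }
    }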

Add a block to StorageFactory.createStorage() capable of creating the new storage object. TODO: use reflection to generate the storage classes.

+
    if(stub instanceof OutputStreamStub) {
+        if( file != null )
+            storage = new OutputStreamStorage((OutputStreamStub)stub,file);
+        else
+            storage = new OutputStreamStorage((OutputStreamStub)stub);
+    }
+

To implement ArgumentTypeDescriptor

+

Create a new object inheriting from type ArgumentTypeDescriptor. Note that the ArgumentTypeDescriptor does NOT need to support the core output type's interface.

+
public class OutputStreamArgumentTypeDescriptor extends ArgumentTypeDescriptor {
+

Implement a truth function indicating which types this ArgumentTypeDescriptor can service.

+
 @Override
+ public boolean supports( Class type ) {
+     return SAMFileWriter.class.equals(type) || StingSAMFileWriter.class.equals(type);
+ }
+

Implement a parse function that constructs the new Stub object. The function should register this type as an output by calling engine.addOutput(stub).

+
 public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches )  {
+     ...
+     OutputStreamStub stub = new OutputStreamStub(new File(fileName));
+     ...
+     engine.addOutput(stub);
+     ....
+     return stub;
+}
+

Add a creator for this new ArgumentTypeDescriptor in CommandLineExecutable.getArgumentTypeDescriptors().

+
 protected Collection<ArgumentTypeDescriptor> getArgumentTypeDescriptors() {
+     return Arrays.asList( new VCFWriterArgumentTypeDescriptor(engine,System.out,argumentSources),
+                           new SAMFileWriterArgumentTypeDescriptor(engine,System.out),
+                           new OutputStreamArgumentTypeDescriptor(engine,System.out) );
+ }
+

After creating these three objects, the new output type should be ready for usage as described above.

+

5. Outstanding issues

+ \ No newline at end of file diff --git a/doc_archive/developer-zone/Scala_resources.md b/doc_archive/developer-zone/Scala_resources.md new file mode 100644 index 000000000..3b541d582 --- /dev/null +++ b/doc_archive/developer-zone/Scala_resources.md @@ -0,0 +1,32 @@ +## Scala resources + +http://gatkforums.broadinstitute.org/gatk/discussion/1897/scala-resources + +

References for Scala development

+

The online course Functional Programming Principles in Scala taught by Martin Odersky, creator of Scala, and a Cheat Sheet for that course

+

Scala by Example (PDF) - also by Martin Odersky

+

First Steps to Scala

+

Programming Scala - O'Reilly Media

+

Scala School - Twitter

+

Scala Style Guide

+

A Concise Introduction To Scala

+

Scala Operator Cheat Sheet

+

A Tour of Scala

+

Stack Overflow

+ +

A Conversation with Martin Odersky

+
1. The Origins of Scala
2. The Goals of Scala's Design
3. The Purpose of Scala's Type System
4. The Point of Pattern Matching in Scala

Scala Collections for the Easily Bored

+
1. A Tale of Two Flavors
2. One at a Time
3. All at Once
\ No newline at end of file diff --git a/doc_archive/developer-zone/Seeing_deletion_spanning_reads_in_LocusWalkers.md b/doc_archive/developer-zone/Seeing_deletion_spanning_reads_in_LocusWalkers.md new file mode 100644 index 000000000..ff9954be8 --- /dev/null +++ b/doc_archive/developer-zone/Seeing_deletion_spanning_reads_in_LocusWalkers.md @@ -0,0 +1,48 @@ +## Seeing deletion spanning reads in LocusWalkers + +http://gatkforums.broadinstitute.org/gatk/discussion/1348/seeing-deletion-spanning-reads-in-locuswalkers + +

1. Introduction

+

The LocusTraversal now supports passing walkers reads that have deletions spanning the current locus. This is useful in many situations, for example when you want to calculate coverage, or call variants and need to avoid calling variants where there are a lot of deletions.

+

Currently, the system by default will not pass you deletion-spanning reads. In order to see them, you need to override the function:

+
/**
+ * (conceptual static) method that states whether you want to see reads piling up at a locus
+ * that contain a deletion at the locus.
+ *
+ * ref:   ATCTGA
+ * read1: ATCTGA
+ * read2: AT--GA
+ *
+ * Normally, the locus iterator only returns a list of read1 at this locus at position 3, but
+ * if this function returns true, then the system will return (read1, read2) with offsets
+ * of (3, -1).  The -1 offset indicates a deletion in the read.
+ *
+ * @return false if you don't want to see deletions, or true if you do
+ */
+public boolean includeReadsWithDeletionAtLoci() { return true; }
+

in your walker. Now you will start seeing deletion-spanning reads in your walker. These reads are flagged with offsets of -1, so that you can:

+
    for ( int i = 0; i < context.getReads().size(); i++ ) {
+        SAMRecord read = context.getReads().get(i);
+        int offset = context.getOffsets().get(i);
+
+       if ( offset == -1 ) 
+               nDeletionReads++;
+        else 
+               nCleanReads++;
+    }
+

There are also two convenience functions in AlignmentContext to extract subsets of the reads with and without spanning deletions:

+
/**
+ * Returns only the reads in ac that do not contain spanning deletions of this locus
+ * 
+ * @param ac
+ * @return
+ */
+public static AlignmentContext withoutSpanningDeletions( AlignmentContext ac );
+
+/**
+ * Returns only the reads in ac that do contain spanning deletions of this locus
+ * 
+ * @param ac
+ * @return
+ */
+public static AlignmentContext withSpanningDeletions( AlignmentContext ac );
\ No newline at end of file diff --git a/doc_archive/developer-zone/Setting_up_your_dev_environment:_Maven_and_IntelliJ_for_GATK_3+.md b/doc_archive/developer-zone/Setting_up_your_dev_environment:_Maven_and_IntelliJ_for_GATK_3+.md new file mode 100644 index 000000000..8eeb5b6cb --- /dev/null +++ b/doc_archive/developer-zone/Setting_up_your_dev_environment:_Maven_and_IntelliJ_for_GATK_3+.md @@ -0,0 +1,85 @@ +## Setting up your dev environment: Maven and IntelliJ for GATK 3+ + +http://gatkforums.broadinstitute.org/gatk/discussion/4023/setting-up-your-dev-environment-maven-and-intellij-for-gatk-3 + +

Overview

+

Since GATK 3.0, we use Apache Maven (instead of Ant) as our build system, and IntelliJ as our IDE (Integrated Development Environment). This document describes how to get set up to use Maven as well as how to create an IntelliJ project around our Maven project structure.

+

Before you start

+ +

Setting up Maven

+
1. Check whether you can run mvn --version on your machine. If you can't, install Maven from here.

2. Ensure that the JAVA_HOME environment variable is properly set. If it's not, add the appropriate line to your shell's startup file:

    for tcsh:

        setenv JAVA_HOME `/usr/libexec/java_home`

    for bash:

        export JAVA_HOME=`/usr/libexec/java_home`

Note that the commands above use backticks, not single quotes.

+

Basic Maven usage

+
1. To compile everything, type:

        mvn verify

2. To compile the GATK but not Queue (much faster!), the command is:

        mvn verify -P\!queue

    Note that the ! needs to be escaped with a backslash to avoid interpretation by the shell.

3. To obtain a clean working directory, type:

        mvn clean

4. If you're used to using ant to compile the GATK, you should be able to feed your old ant commands to the ant-bridge.sh script in the root directory. For example:

        ./ant-bridge.sh test -Dsingle=MyTestClass

Setting up IntelliJ

+
1. Run mvn test-compile in your git clone's root directory.

2. Open IntelliJ.

3. File -> import project, select your git clone directory, then click "ok".

4. On the next screen, select "import project from external model", then "maven", then click "next".

5. Click "next" on the next screen without changing any defaults -- in particular:

    - DON'T check "Import maven projects automatically"
    - DON'T check "Create module groups for multi-module maven projects"

6. On the "Select Profiles" screen, make sure private and protected ARE checked, then click "next".

7. On the next screen, the "gatk-aggregator" project should already be checked for you -- if not, then check it. Click "next".

8. Select the 1.7 SDK, then click "next".

9. Select an appropriate project name (can be anything), then click "next" (or "finish", depending on your version of IntelliJ).

10. Click "Finish" to create the new IntelliJ project.

11. That's it! Due to Maven magic, everything else will be set up for you automatically, including modules, libraries, Scala facets, etc.

12. You will see a popup "Maven projects need to be imported" on every IntelliJ startup. You should click import unless you're working on the actual pom files that make up the build system.
\ No newline at end of file diff --git a/doc_archive/developer-zone/Sting_to_GATK_renaming.md b/doc_archive/developer-zone/Sting_to_GATK_renaming.md new file mode 100644 index 000000000..ea0de6abd --- /dev/null +++ b/doc_archive/developer-zone/Sting_to_GATK_renaming.md @@ -0,0 +1,736 @@ +## Sting to GATK renaming + +http://gatkforums.broadinstitute.org/gatk/discussion/4173/sting-to-gatk-renaming + +

Overview

+

The GATK 3.2 source code uses new java package names, directory paths, and executable jars. Post GATK 3.2, any patches submitted via pull requests should also include classes moved to the appropriate artifact.

+

Note that the document includes references to the private module, which is part of our internal development codebase but is not available to the general public.

+

Summary

+

A long-term ideal of the GATK is to separate out reusable parts and eventually make them available as compiled libraries via centralized binary repositories. Ahead of publishing, a number of steps must be completed. One of the larger steps was completed for GATK 3.2, when the code base rebranded all references of Sting to GATK.

+

Currently implemented changes include:

+ +

As of May 16, 2014, remaining TODOs ahead of publishing to central include:

+ +

Now that the new package names and Maven artifacts are available, any pull request should include ensuring that updated classes are also moved into the correct GATK Maven artifact. While there are a significant number of classes, cleaning up as we go along will allow the larger task to be completed in a distributed fashion.

+

The full lists of new Maven artifacts and renamed packages are below under [Renamed Artifact Directories]. For those developers in the middle of a git rebase around commits before and after 3.2, here is an abridged mapping of renamed directories for those trying to locate files:

| Old Maven Artifact | New Maven Artifact |
| --- | --- |
| public/sting-root | public/gatk-root |
| public/sting-utils | public/gatk-utils |
| public/gatk-framework | public/gatk-tools-public |
| public/queue-framework | public/gatk-queue |
| protected/gatk-protected | protected/gatk-tools-protected |
| private/gatk-private | private/gatk-tools-private |
| private/queue-private | private/gatk-queue-private |

QScripts are no longer located with the Queue engine, and instead are now located with the GATK wrappers implemented as Queue extensions. See [Separated Queue Extensions] for more info.

+

Changes

+

Separating the GATK Engine and Tools

+

Starting with GATK 3.2, separate Maven utility artifacts exist to separate reusable portions of the GATK engine apart from tool specific implementations. The biggest impact this will have on developers is the separation of the walkers packages.

+

In GATK versions <= 3.1 there was one package for both the base classes and the implementations of walkers:

+ +

In GATK versions >= 3.2 there are two packages. The first contains the base interfaces, annotations, etc. The latter package is for the concrete tools implemented as walkers:

+ +

Renamed Binary Packages

+

Previously, depending on how the source code was compiled, the executable gatk-package-3.1.jar and queue-package-3.1.jar (aka GenomeAnalysisTK.jar and Queue.jar) contained various mixes of public/protected/private code. For example, if the private directory was present when the source code was compiled, the same artifact named gatk-package-3.1.jar might, or might not contain private code.

+

Starting with 3.2, there are two versions of the jar created, each with specific file contents.

| New Maven Artifact | Alias in the /target folder | Packaged contents |
| --- | --- | --- |
| gatk-package-distribution-3.2.jar | GenomeAnalysisTK.jar | public, protected |
| gatk-package-internal-3.2.jar | GenomeAnalysisTK-internal.jar | public, protected, private |
| gatk-queue-package-distribution-3.2.jar | Queue.jar | public, protected |
| gatk-queue-package-internal-3.2.jar | Queue-internal.jar | public, protected, private |

Separated Queue Extensions

+

When creating a packaged version of Queue, the GATKExtensionsGenerator builds Queue engine compatible command line wrappers around each GATK walker. Previously, the wrappers were generated during the compilation of the Queue framework. Similar to the binary packages, depending on who built the source code, queue-framework-3.1.jar would contain various mixes of public/protected/private wrappers.

+

Starting with GATK 3.2, the gatk-queue-3.2.jar only contains code for the Queue engine. Generated and manually created extensions for wrapping any other command line programs are all included in separate artifacts. Due to a current limitation regarding how the generator uses reflection, the generator cannot build wrappers for just private classes without also generating protected and public classes. Thus, there are three different Maven artifacts generated, that contain different mixes of public, protected and private wrappers.

| Extensions Artifact | Generated wrappers for GATK tools |
| --- | --- |
| gatk-queue-extensions-public-3.2.jar | public only |
| gatk-queue-extensions-distribution-3.2.jar | public, protected |
| gatk-queue-extensions-internal-3.2.jar | public, protected, private |

As for QScripts that used to be located with the framework, they are now located with the generated wrappers.

| Old QScripts Artifact Directory | New QScripts Artifact Directory |
| --- | --- |
| public/queue-framework/src/main/qscripts | public/gatk-queue-extensions-public/src/main/qscripts |
| private/queue-private/src/main/qscripts | private/gatk-queue-extensions-internal/src/main/qscripts |

Renamed Artifact Directories

+

The following list shows the mapping of artifact names pre and post GATK 3.2. In addition to the engine changes, the packaging updates and extensions changes above also affected Maven artifact refactoring. The packaging artifacts have split from a single public to protected and private versions, and new queue extensions artifacts have been added as well.

| Maven Artifact <= GATK 3.1 | Maven Artifact >= GATK 3.2 |
| --- | --- |
| /pom.xml (sting-aggregator) | /pom.xml (gatk-aggregator) |
| public/sting-root | public/gatk-root |
| public/sting-utils | public/gatk-utils |
| none | public/gatk-engine |
| public/gatk-framework | public/gatk-tools-public |
| public/queue-framework | public/gatk-queue |
| public/gatk-queue-extgen | public/gatk-queue-extensions-generator |
| protected/gatk-protected | protected/gatk-tools-protected |
| private/gatk-private | private/gatk-tools-private |
| private/queue-private | private/gatk-queue-private |
| public/gatk-package | protected/gatk-package-distribution |
| public/queue-package | protected/gatk-queue-package-distribution |
| none | private/gatk-package-internal |
| none | private/gatk-queue-package-internal |
| none | public/gatk-queue-extensions-public |
| none | protected/gatk-queue-extensions-distribution |
| none | private/gatk-queue-extensions-internal |

A note regarding the aggregator:

+

The aggregator is the pom.xml in the top-level directory of the GATK source code. When someone clones the GATK source code and runs mvn in the top-level directory, the aggregator pom.xml is the one executed.

+

The root is a pom.xml that contains all common Maven configuration. There are a couple dependent pom.xml files that inherit configuration from the root, but are NOT aggregated during normal source compilation.

+

As of GATK 3.2, these un-aggregated child artifacts are VectorPairHMM and picard-maven. They are not run by default with each invocation of mvn on the GATK source code.

+

For more clarification on Maven Inheritance vs. Aggregation, see the Maven introduction to the pom.

+

Renamed Java/Scala Package Names

+

In GATK 3.2, except for classes with Sting in the name, all file names are still the same. To locate migrated files under new java package names, developers should either use Intellij IDEA Navigation or /bin/find to locate the same file they used previously.

+

The biggest change most developers will face is the new package names for GATK classes. Code entanglement does not permit simply moving the classes into the correct Maven artifacts, as a few lines of code must be edited inside a large number of files. So, post renaming, only a very small number of classes were moved out of the incorrect Maven artifacts, as examples.

+

As of May 16, 2014, the migrated GATK package distribution is as follows. This list includes only main classes. The table excludes all tests, renamed files such as StingException, certain private Queue wrappers, and qscripts renamed to end in *.scala.

| Scope | Type | <= 3.1 Artifact | <= 3.1 Package | >= GATK 3.2 Artifact | >= 3.2 GATK Package | Files |
| --- | --- | --- | --- | --- | --- | --- |
| public | java | gatk-framework | o.b.s | gatk-utils | o.b.g | 4 |
| public | java | gatk-framework | o.b.s.gatk | gatk-engine | o.b.g.engine | 2 |
| public | java | gatk-framework | o.b.s | gatk-tools-public | o.b.g | 202 |
| public | java | gatk-framework | o.b.s | gatk-tools-public | o.b.g.utils | 49 |
| public | java | gatk-framework | o.b.s | gatk-tools-public | o.b.g.engine | 34 |
| public | java | gatk-framework | o.b.s.gatk | gatk-tools-public | o.b.g.engine | 244 |
| public | java | gatk-framework | o.b.s.gatk | gatk-tools-public | o.b.g.tools | 134 |
| public | java | gatk-framework | o.b.s.gatk | gatk-tools-public | o.b.g.tools.walkers | 2 |
| protected | java | gatk-protected | o.b.s | gatk-tools-protected | o.b.g | 44 |
| protected | java | gatk-protected | o.b.s.gatk | gatk-tools-protected | o.b.g.engine | 1 |
| protected | java | gatk-protected | o.b.s.gatk | gatk-tools-protected | o.b.g.tools | 209 |
| private | java | gatk-private | o.b.s | gatk-tools-private | o.b.g | 23 |
| private | java | gatk-private | o.b.s | gatk-tools-private | o.b.g.utils | 7 |
| private | java | gatk-private | o.b.s.gatk | gatk-tools-private | o.b.g.engine | 5 |
| private | java | gatk-private | o.b.s.gatk | gatk-tools-private | o.b.g.tools | 133 |
| public | java | queue-framework | o.b.s | gatk-queue | o.b.g | 2 |
| public | scala | queue-framework | o.b.s | gatk-queue | o.b.g | 72 |
| public | scala | queue-framework | o.b.s | gatk-queue-extensions-public | o.b.g | 31 |
| public | qscripts | queue-framework | o.b.s | gatk-queue-extensions-public | o.b.g | 12 |
| private | scala | queue-private | o.b.s | gatk-queue-private | o.b.g | 2 |
| private | qscripts | queue-private | o.b.s | gatk-queue-extensions-internal | o.b.g | 118 |

During all future code modifications and pull requests, classes should be refactored to correct artifacts and package as follows.

+

All non-engine tools should be in the tools artifacts, with appropriate sub-package names.

| Scope | Type | Artifact | Package(s) |
| --- | --- | --- | --- |
| public | java | gatk-utils | o.b.g.utils |
| public | java | gatk-engine | o.b.g.engine |
| public | java | gatk-tools-public | o.b.g.tools.walkers |
| public | java | gatk-tools-public | o.b.g.tools.* |
| protected | java | gatk-tools-protected | o.b.g.tools.walkers |
| protected | java | gatk-tools-protected | o.b.g.tools.* |
| private | java | gatk-tools-private | o.b.g.tools.walkers |
| private | java | gatk-tools-private | o.b.g.tools.* |
| public | java | gatk-queue | o.b.g.queue |
| public | scala | gatk-queue | o.b.g.queue |
| public | scala | gatk-queue-extensions-public | o.b.g.queue.extensions |
| public | qscripts | gatk-queue-extensions-public | o.b.g.queue.qscripts |
| private | scala | gatk-queue-private | o.b.g.queue |
| private | qscripts | gatk-queue-extensions-internal | o.b.g.queue.qscripts |

Renamed Classes

+

The following class names were updated to replace Sting with GATK.

| Old Sting class | New GATK class |
| --- | --- |
| ArtificialStingSAMFileWriter | ArtificialGATKSAMFileWriter |
| ReviewedStingException | ReviewedGATKException |
| StingException | GATKException |
| StingSAMFileWriter | GATKSAMFileWriter |
| StingSAMIterator | GATKSAMIterator |
| StingSAMIteratorAdapter | GATKSAMIteratorAdapter |
| StingSAMRecordIterator | GATKSAMRecordIterator |
| StingTextReporter | GATKTextReporter |

Common Git/Maven Issues

+

Renamed files

+

The 3.2 renaming patch is actually split into two commits. The first commit renames the files without making any content changes, while the second changes the contents of the files without changing any file paths.

+

When dealing with renamed files, it is best to work with a clean directory during rebasing. It will be easier for you to track files that you may not have added to git.

+

After running a git rebase or merge, you may first run into problems with files that you renamed and that were moved during the GATK 3.2 package renaming. As a general rule, the renaming only changes directory names. The exceptions to this rule are classes such as StingException that were renamed to GATKException, listed under [Renamed Classes]. The workflow for resolving these merge issues is to find the list of your renamed files, put your content in the correct location, then register the changes with git.

+

To obtain the list of renamed directories and files:

+
1. Use git status to get a list of affected files
2. Find the common old directory and file name under "both deleted"
3. Find your new file name under "added by them" (yes, you are "them")
4. Find the new directory under "added by us"

Then, to resolve the issue for each file:

+
1. Move your copy of your renamed file to the new directory
2. git rm the old paths as appropriate
3. git add the new path
4. Repeat for other files until git status shows "all conflicts fixed"

Upon first rebasing you will see a lot of text. At this moment, you can ignore most of it, and use git status instead.

+

For the purposes of illustration, while running git rebase it is perfectly normal to see something similar to:

+
$ git rebase master
+First, rewinding head to replay your work on top of it...
+Applying: <<< Your first commit message here >>>
+Using index info to reconstruct a base tree...
+A   protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
+A   protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java
+<<<Other files that you renamed.>>>
+warning: squelched 12 whitespace errors
+warning: 34 lines add whitespace errors.
+Falling back to patching base and 3-way merge...
+CONFLICT (rename/rename): Rename "protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java"->"protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GenotypingEngineUnitTest.java" in branch "HEAD" rename "protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java"->"protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java" in "<<< Your first commit message here >>>"
+CONFLICT (rename/rename): Rename "protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java"->"protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GenotypingEngine.java" in branch "HEAD" rename "protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java"->"protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java" in "<<< Your first commit message here >>>"
+Failed to merge in the changes.
+Patch failed at 0001 Example conflict.
+The copy of the patch that failed is found in:
+   /Users/zzuser/src/gsa-unstable/.git/rebase-apply/patch
+
+When you have resolved this problem, run "git rebase --continue".
+If you prefer to skip this patch, run "git rebase --skip" instead.
+To check out the original branch and stop rebasing, run "git rebase --abort".
+
+$
+

While everything you need to resolve the issue is technically in the message above, it may be much easier to track what's going on using git status.

+
$ git status
+rebase in progress; onto cba4321
+You are currently rebasing branch 'zz_renaming_haplotypecallergenotypingengine' on 'cba4321'.
+  (fix conflicts and then run "git rebase --continue")
+  (use "git rebase --skip" to skip this patch)
+  (use "git rebase --abort" to check out the original branch)
+
+Unmerged paths:
+  (use "git reset HEAD <file>..." to unstage)
+  (use "git add/rm <file>..." as appropriate to mark resolution)
+
+    added by them:      protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java
+    both deleted:       protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
+    added by them:      protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java
+    both deleted:       protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java
+    added by us:        protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GenotypingEngine.java
+    added by us:        protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GenotypingEngineUnitTest.java
+
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+<<< possible untracked files if your working directory is not clean>>>
+
+no changes added to commit (use "git add" and/or "git commit -a")
+$ 
+

Let's look at the main java file as an example. If you are having issues figuring out the new directory and new file name, they are all listed in the output.

+
Path in the common ancestor branch:
+ |      old source directory       |                     old package name                     |   old file name     |
+  protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
+
+Path in the new master branch before merge:
+ |           new source directory             |                 new package name                    |   old file name     |
+  protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GenotypingEngine.java
+
+Path in your branch before merge:
+ |      old source directory       |                     old package name                     |           new file name            |
+  protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java
+
+Path in your branch post merge:
+ |           new source directory             |                 new package name                    |           new file name            |
+  protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java    
+

After identifying the new paths for use post merge, use the following workflow for each file:

+
1. Move or copy your version of the renamed file to the new directory
2. git rm the three old file paths: common ancestor, old directory with new file name, and new directory with old file name
3. git add the new file name in the new directory

After you process all files correctly, the output of git status should show "all conflicts fixed" and all your files listed as renamed.

+
$ git status
+rebase in progress; onto cba4321
+You are currently rebasing branch 'zz_renaming_haplotypecallergenotypingengine' on 'cba4321'.
+  (all conflicts fixed: run "git rebase --continue")
+
+Changes to be committed:
+  (use "git reset HEAD <file>..." to unstage)
+
+    renamed:    protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GenotypingEngine.java -> protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java
+    renamed:    protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GenotypingEngineUnitTest.java -> protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java
+
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+<<< possible untracked files if your working directory is not clean>>>
+
+$
+

Continue your rebase, handling other merges as normal.

+
$ git rebase --continue
+

Fixing imports

+

Because all the package names are different in 3.2, while rebasing you may run into conflicts due to imports you also changed. Use your favorite editor to fix the imports within the files. Then try recompiling, and repeat as necessary until your code works.

+

While editing the files with conflicts with a basic text editor may work, IntelliJ IDEA also offers a special merge tool that may help via the menu:

+
VCS > Git > Resolve Conflicts...
+

For each file, click on the "Merge" button in the first dialog. Use the various buttons in the Conflict Resolution Tool to automatically accept any changes that are not in conflict. Then find and edit any remaining conflicts that require further manual intervention.

+

Once you begin editing the import statements in the three way merge tool, another IntelliJ IDEA 13.1 feature that may speed up repairing blocks of import statements is Multiple Selections. Find a block of import lines that need the same changes. Hold down the option key as you drag your cursor vertically down the edit point on each import line. Then begin typing or deleting text from the multiple lines.

+

Switching branches

+

Even after a successful merge, you may still run into stale GATK code or links from modifications before and after the 3.2 package renaming. To significantly reduce these chances, run mvn clean before and then again after switching branches.

+

If this doesn't work, run mvn clean && git status, looking for any directories that shouldn't be in the current branch. It is possible that some files were not correctly moved, including classes or test resources. Find the files still in the old directories via a command such as find public/gatk-framework -type f. Then move them to the correct new directories and commit them into git.

+

Slow Builds with Queue and Private

+

Due to the [Renamed Binary Packages], the separate artifacts including and excluding private code are now packaged during the Maven package build lifecycle.

+

When building packages, if you only require the GATK tools, run mvn verify -P\!queue to significantly speed up the default packaging time.

+

Alternatively, if you do not require building private source, then disable private compiling via mvn verify -P\!private.

+

The two may be combined as well via: mvn verify -P\!queue,\!private.

+

The exclamation mark is a special shell character that must be escaped, in the above case with a backslash. Shell quotes may also be used: mvn verify -P'!queue,!private'.

+

Alternatively, developers with access to private may often want to disable packaging the protected distributions. In this case, use the gsadev profile. This may be done via mvn verify -Pgsadev or, excluding Queue, mvn verify -Pgsadev,\!queue.

+

Stale symlinks

+

Users see errors from Maven when an unclean repo in git is updated. Because BaseTest.java currently hardcodes relative paths to "public/testdata", Maven creates these symbolic links all over the file system to help the various tests in different modules find the relative path "/public/testdata".

+

However, our Maven support has evolved from 2.8, to 3.0, to now the 3.2 renaming, and each change has moved the symbolic link's target directory. Whenever a stale symbolic link to an old testdata directory remains in the user's folder, Maven reports that it will not remove the link, because Maven doesn't know why the link points to the wrong folder (answer: the link is from an old git checkout) and assumes it's a bug in the build.

+

If you don't have a stale / unclean Maven repo when updating git via merge/rebase/checkout, you will never see this issue.

+

The script that can remove the stale symlinks, public/src/main/scripts/shell/delete_maven_links.sh, should run automatically during a mvn test-compile or mvn verify.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Tribble.md b/doc_archive/developer-zone/Tribble.md new file mode 100644 index 000000000..edaf0a0ad --- /dev/null +++ b/doc_archive/developer-zone/Tribble.md @@ -0,0 +1,119 @@ +## Tribble + +http://gatkforums.broadinstitute.org/gatk/discussion/1349/tribble + +

1. Overview

+

The Tribble project was started as an effort to overhaul our reference-ordered data system; we had many different formats that were shoehorned into a common framework that didn't really work as intended. What we wanted was a common framework that allowed for searching of reference ordered data, regardless of the underlying type. Jim Robinson had developed indexing schemes for text-based files, which was incorporated into the Tribble library.

+

2. Architecture Overview

+

Tribble provides a lightweight interface and API for querying features and creating indexes from feature files, while allowing iteration over known feature files that we're unable to create indexes for. The main entry point for external users is the BasicFeatureReader class. It takes in a codec, an index file, and a file containing the features to be processed. With an instance of a BasicFeatureReader, you can query for features that span a specific location, or get an iterator over all the records in the file.

+

3. Developer Overview

+

For developers, there are two important classes to implement: the FeatureCodec, which decodes lines of text and produces features, and the feature class, which is your underlying record type.

+ +

For developers there are two classes that are important:

+ +

To implement your new format into Tribble, you need to implement the two above classes (in an appropriately named subfolder in the Tribble check-out). The Feature object should know nothing about the file representation; it should represent the data as an in-memory object. The interface for a feature looks like:

+
public interface Feature {
+
+    /**
+     * Return the features reference sequence name, e.g chromosome or contig
+     */
+    public String getChr();
+
+    /**
+     * Return the start position in 1-based coordinates (first base is 1)
+     */
+    public int getStart();
+
+    /**
+     * Return the end position following 1-based fully closed conventions.  The length of a feature is
+     * end - start + 1;
+     */
+    public int getEnd();
+}
+

And the interface for FeatureCodec:

+
/**
+ * the base interface for classes that read in features.
+ * @param <T> The feature type this codec reads
+ */
+public interface FeatureCodec<T extends Feature> {
+    /**
+     * Decode a line to obtain just its FeatureLoc for indexing -- contig, start, and stop.
+     *
+     * @param line the input line to decode
+     * @return  Return the FeatureLoc encoded by the line, or null if the line does not represent a feature (e.g. is
+     * a comment)
+     */
+    public Feature decodeLoc(String line);
+
+    /**
+     * Decode a line as a Feature.
+     *
+     * @param line the input line to decode
+     * @return  Return the Feature encoded by the line,  or null if the line does not represent a feature (e.g. is
+     * a comment)
+     */
+    public T decode(String line);
+
+    /**
+     * This function returns the object the codec generates.  This is allowed to be Feature in the case where
+     * conditionally different types are generated.  Be as specific as you can though.
+     *
+     * This function is used by reflections based tools, so we can know the underlying type
+     *
+     * @return the feature type this codec generates.
+     */
+    public Class<T> getFeatureType();
+
+    /**  Read and return the header, or null if there is no header.
+     *
+     * @return header object
+     */
+    public Object readHeader(LineReader reader);
+}
+
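To make the contract concrete, here is a minimal, hypothetical codec for a three-column, tab-delimited interval format (contig, start, stop in 1-based coordinates). It is only a sketch written against the two interfaces shown above; the class names are invented, and the exact import paths (for Feature, FeatureCodec, and LineReader) may differ in your Tribble checkout.

    // Hypothetical example: a minimal Feature and FeatureCodec for a
    // "contig<TAB>start<TAB>stop" text format. Names are illustrative only.
    public class SimpleIntervalFeature implements Feature {
        private final String chr;
        private final int start;
        private final int end;

        public SimpleIntervalFeature(String chr, int start, int end) {
            this.chr = chr;
            this.start = start;
            this.end = end;
        }

        public String getChr() { return chr; }
        public int getStart()  { return start; }
        public int getEnd()    { return end; }
    }

    public class SimpleIntervalCodec implements FeatureCodec<SimpleIntervalFeature> {

        public Feature decodeLoc(String line) {
            return decode(line);                    // location and full record are identical here
        }

        public SimpleIntervalFeature decode(String line) {
            if (line.startsWith("#")) return null;  // comment lines do not represent features
            String[] fields = line.split("\t");
            return new SimpleIntervalFeature(fields[0],
                                             Integer.parseInt(fields[1]),
                                             Integer.parseInt(fields[2]));
        }

        public Class<SimpleIntervalFeature> getFeatureType() {
            return SimpleIntervalFeature.class;
        }

        public Object readHeader(LineReader reader) {
            return null;                            // this toy format has no header
        }
    }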

4. Supported Formats

+

The following formats are supported in Tribble:

+ +

5. Updating the Tribble, htsjdk, and/or Picard library

+

Updating the revision of Tribble on the system is a relatively straightforward task if the following steps are taken.

+

NOTE: Any directory starting with ~ may be different on your machine, depending on where you cloned the various repositories for gsa-unstable, picard, and htsjdk.

+

A Maven script to install picard into the local repository is located under gsa-unstable/private/picard-maven. To operate, it requires a symbolic link named picard pointing to a working checkout of the picard github repository. NOTE: compiling picard requires an htsjdk github repository checkout available at picard/htsjdk, either as a subdirectory or another symbolic link. The final full path should be gsa-unstable/private/picard-maven/picard/htsjdk.

+
cd ~/src/gsa-unstable
+cd private/picard-maven
+ln -s ~/src/picard picard
+

Create a git branch of Picard and/or htsjdk and make your changes. To install your changes into the GATK you must run mvn install in the private/picard-maven directory. This will compile and copy the jars into gsa-unstable/public/repo, and update gsa-unstable/gatk-root/pom.xml with the corresponding version. While you are making changes, your revisions of Picard and htsjdk will be labeled with -SNAPSHOT.

+
cd ~/src/gsa-unstable
+cd private/picard-maven
+mvn install
+

Continue testing in the GATK. Once your changes and updated tests for picard/htsjdk are complete, push your branch and submit your pull request to the Picard and/or htsjdk github. After your Picard/htsjdk patches are accepted, switch your Picard/htsjdk branches back to the master branch. NOTE: Leave your gsa-unstable branch on your development branch!

+
cd ~/src/picard
+ant clean
+git checkout master
+git fetch
+git rebase
+cd htsjdk
+git checkout master
+git fetch
+git rebase
+

NOTE: The version numbers of old and new Picard/htsjdk will vary, and during active development will end with -SNAPSHOT. While, if needed, you may push a -SNAPSHOT version for testing on Bamboo, you should NOT submit a pull request with a -SNAPSHOT version. -SNAPSHOT indicates your local changes are not reproducible from source control.

+

When ready, run mvn install once more to create the non -SNAPSHOT versions under gsa-unstable/public/repo. In that directory, git add the new version, and git rm the old versions.

+
cd ~/src/gsa-unstable
+cd public/repo
+git add picard/picard/1.115.1499/
+git add samtools/htsjdk/1.115.1509/
+git rm -r picard/picard/1.112.1452/
+git rm -r samtools/htsjdk/1.112.1452/
+

Commit and then push your gsa-unstable branch, then issue a pull request for review.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Using_DiffEngine_to_summarize_differences_between_structured_data_files.md b/doc_archive/developer-zone/Using_DiffEngine_to_summarize_differences_between_structured_data_files.md new file mode 100644 index 000000000..70477ec56 --- /dev/null +++ b/doc_archive/developer-zone/Using_DiffEngine_to_summarize_differences_between_structured_data_files.md @@ -0,0 +1,102 @@ +## Using DiffEngine to summarize differences between structured data files + +http://gatkforums.broadinstitute.org/gatk/discussion/1299/using-diffengine-to-summarize-differences-between-structured-data-files + +

1. What is DiffEngine?

+

DiffEngine is a summarizing difference engine that allows you to compare two structured files -- such as BAMs and VCFs -- to find the differences between them. This is primarily useful in regression testing or optimization, where you want to ensure that the differences are those that you expect and not any others.

+

2. The summarized differences

+

The GATK contains a summarizing difference engine called DiffEngine that compares hierarchical data structures to emit:

+ +

3. The DiffObjects walker

+

The GATK contains a private walker called DiffObjects that allows you access to the DiffEngine capabilities on the command line. Simply provide the walker with the master and test files and it will emit summarized differences for you.

+

4. Understanding the output

+

The DiffEngine system compares two hierarchical data structures for specific differences in the values of named nodes. Suppose I have three trees:

+
Tree1=(A=1 B=(C=2 D=3)) 
+Tree2=(A=1 B=(C=3 D=3 E=4))
+Tree3=(A=1 B=(C=4 D=3 E=4))
+

where every node in the tree is named, or is a raw value (here all leaf values are integers). The DiffEngine traverses these data structures by name, identifies equivalent nodes by fully qualified names (Tree1.A is distinct from Tree2.A), and determines where their values are equal (Tree1.A=1, Tree2.A=1, so they are).

+

These itemized differences are listed as:

+
Tree1.B.C=2 != Tree2.B.C=3
+Tree1.B.C=2 != Tree3.B.C=4
+Tree2.B.C=3 != Tree3.B.C=4
+Tree1.B.E=MISSING != Tree2.B.E=4
+

This is conceptually very similar to the output of the unix command line tool diff. What's nice about DiffEngine though is that it computes similarity among the itemized differences and displays the count of each difference name in the system. In the above example, the field C is not equal three times, while the missing E in Tree1 occurs only once. So the summary is:

+
*.B.C : 3
+*.B.E : 1
+

where the * operator indicates that any named field matches. This output is sorted by counts, and provides an immediate picture of the commonly occurring differences between the files.

+

Below is a detailed example of two VCF fields that differ because of a bug in the AC, AF, and AN counting routines, detected by the integrationtest framework (more below). You can see that although there are many specific instances of these differences between the two files, the summarized differences provide an immediate picture that the AC, AF, and AN fields are the major causes of the differences.

+
[testng] path                                                              count
+[testng] *.*.*.AC                                                         6
+[testng] *.*.*.AF                                                         6
+[testng] *.*.*.AN                                                         6
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AC  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AF  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AN  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AC  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AF  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AN  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AC  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AF  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AN  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC  1
+

5. Integration tests

+

The DiffEngine codebase that supports these calculations is integrated into the integrationtest framework, so that when a test fails the system automatically summarizes the differences between the master MD5 file and the failing MD5 file, if it is an understood type. When failing you will see in the integration test logs not only the basic information, but the detailed DiffEngine output.

+

For example, in the output below I broke the GATK BAQ calculation and the integration test DiffEngine clearly identifies that all of the records differ in their BQ tag value in the two BAM files:

+
/humgen/1kg/reference/human_b36_both.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam -o /var/folders/Us/UsMJ3xRrFVyuDXWkUos1xkC43FQ/-Tmp-/walktest.tmp_param.05785205687740257584.tmp -L 1:10,000,000-10,100,000 -baq RECALCULATE -et NO_ET
+   [testng] WARN  22:59:22,875 TextFormattingUtils - Unable to load help text.  Help output will be sparse.
+   [testng] WARN  22:59:22,875 TextFormattingUtils - Unable to load help text.  Help output will be sparse.
+   [testng] ##### MD5 file is up to date: integrationtests/e5147656858fc4a5f470177b94b1fc1b.integrationtest
+   [testng] Checking MD5 for /var/folders/Us/UsMJ3xRrFVyuDXWkUos1xkC43FQ/-Tmp-/walktest.tmp_param.05785205687740257584.tmp [calculated=e5147656858fc4a5f470177b94b1fc1b, expected=4ac691bde1ba1301a59857694fda6ae2]
+   [testng] ##### Test testPrintReadsRecalBAQ is going fail #####
+   [testng] ##### Path to expected   file (MD5=4ac691bde1ba1301a59857694fda6ae2): integrationtests/4ac691bde1ba1301a59857694fda6ae2.integrationtest
+   [testng] ##### Path to calculated file (MD5=e5147656858fc4a5f470177b94b1fc1b): integrationtests/e5147656858fc4a5f470177b94b1fc1b.integrationtest
+   [testng] ##### Diff command: diff integrationtests/4ac691bde1ba1301a59857694fda6ae2.integrationtest integrationtests/e5147656858fc4a5f470177b94b1fc1b.integrationtest
+   [testng] ##:GATKReport.v0.1 diffences : Summarized differences between the master and test files.
+   [testng] See http://www.broadinstitute.org/gsa/wiki/index.php/DiffObjectsWalker_and_SummarizedDifferences for more information
+   [testng] Difference                                                                               NumberOfOccurrences
+   [testng] *.*.*.BQ                                                                                 895
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:2:266:272:361.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:5:245:474:254.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:5:255:178:160.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:6:158:682:495.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:6:195:591:884.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:7:165:236:848.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:7:191:223:910.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:7:286:279:434.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAF_0002_FC205Y7AAXX:2:106:516:354.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAF_0002_FC205Y7AAXX:3:102:580:518.BQ  1
+   [testng]
+   [testng] Note that the above list is not comprehensive.  At most 20 lines of output, and 10 specific differences will be listed.  Please use -T DiffObjects -R public/testdata/exampleFASTA.fasta -m integrationtests/4ac691bde1ba1301a59857694fda6ae2.integrationtest -t integrationtests/e5147656858fc4a5f470177b94b1fc1b.integrationtest to explore the differences more freely
+

6. Adding your own DiffableObjects to the system

+

The system dynamically finds all classes that implement the following simple interface:

+
public interface DiffableReader {
+    @Ensures("result != null")
+    /**
+     * Return the name of this DiffableReader type.  For example, the VCF reader returns 'VCF' and the
+     * bam reader 'BAM'
+     */
+    public String getName();
+
+    @Ensures("result != null")
+    @Requires("file != null")
+    /**
+     * Read up to maxElementsToRead DiffElements from file, and return them.
+     */
+    public DiffElement readFromFile(File file, int maxElementsToRead);
+
+    /**
+     * Return true if the file can be read into DiffElement objects with this reader. This should
+     * be uniquely true/false for all readers, as the system will use the first reader that can read the
+     * file.  This routine should never throw an exception.  The VCF reader, for example, looks at the
+     * first line of the file for the ##format=VCF4.1 header, and the BAM reader for the BAM_MAGIC value
+     * @param file
+     * @return
+     */
+    @Requires("file != null")
+    public boolean canRead(File file);
+}
+

See the VCF and BAM DiffableReaders for example implementations. If you extend this to new object types, both the DiffObjects walker and the integrationtest framework will automatically work with your new file type.
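As a rough skeleton (not taken from the GATK codebase), a reader for a hypothetical line-based ".mytype" format might look like the following. Only the interface shown above is assumed; how the DiffElement tree is actually built from your records is left as a comment, since that depends on the DiffElement API not reproduced in this document.

    // Hypothetical skeleton of a DiffableReader; class and format names are invented.
    import java.io.File;

    public class MyTypeDiffableReader implements DiffableReader {

        public String getName() { return "MYTYPE"; }

        public DiffElement readFromFile(File file, int maxElementsToRead) {
            // Parse up to maxElementsToRead records from file and build the
            // corresponding DiffElement tree here.
            throw new UnsupportedOperationException("not yet implemented");
        }

        public boolean canRead(File file) {
            // Cheap sniff test, analogous to the VCF reader checking its ##format header:
            // accept based on the extension and never throw.
            return file.getName().endsWith(".mytype");
        }
    }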

\ No newline at end of file diff --git a/doc_archive/developer-zone/Writing_GATKdocs_for_your_walkers.md b/doc_archive/developer-zone/Writing_GATKdocs_for_your_walkers.md new file mode 100644 index 000000000..28a73973b --- /dev/null +++ b/doc_archive/developer-zone/Writing_GATKdocs_for_your_walkers.md @@ -0,0 +1,56 @@ +## Writing GATKdocs for your walkers + +http://gatkforums.broadinstitute.org/gatk/discussion/1324/writing-gatkdocs-for-your-walkers + +

The GATKDocs are what we call "Technical Documentation" in the Guide section of this website. The HTML pages are generated automatically at build time from specific blocks of documentation in the source code.

+

The best place to look for example documentation for a GATK walker is the GATKDocsExample walker in org.broadinstitute.sting.gatk.examples. This is available here.

+

Below is the reproduction of that file from August 11, 2011:

+
/**
+ * [Short one sentence description of this walker]
+ *
+ * <p>
+ * [Functionality of this walker]
+ * </p>
+ *
+ * <h2>Input</h2>
+ * <p>
+ * [Input description]
+ * </p>
+ *
+ * <h2>Output</h2>
+ * <p>
+ * [Output description]
+ * </p>
+ *
+ * <h2>Examples</h2>
+ * PRE-TAG
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ * PRE-TAG
+ *
+ * @category Walker Category
+ * @author Your Name
+ * @since Date created
+ */
+public class GATKDocsExample extends RodWalker<Integer, Integer> {
+    /**
+     * Put detailed documentation about the argument here.  No need to duplicate the summary information
+     * in doc annotation field, as that will be added before this text in the documentation page.
+     *
+     * Notes:
+     * <ul>
+     *     <li>This field can contain HTML as a normal javadoc</li>
+     *     <li>Don't include information about the default value, as gatkdocs adds this automatically</li>
+     *     <li>Try your best to describe in detail the behavior of the argument, as ultimately confusing
+     *          docs here will just result in user posts on the forum</li>
+     * </ul>
+     */
+    @Argument(fullName="full", shortName="short", doc="Brief summary of argument [~ 80 characters of text]", required=false)
+    private boolean myWalkerArgument = false;
+
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return 0; }
+    public Integer reduceInit() { return 0; }
+    public Integer reduce(Integer value, Integer sum) { return value + sum; }
+    public void onTraversalDone(Integer result) { }
+}
\ No newline at end of file diff --git a/doc_archive/developer-zone/Writing_and_working_with_reference_metadata_classes.md b/doc_archive/developer-zone/Writing_and_working_with_reference_metadata_classes.md new file mode 100644 index 000000000..1034b753a --- /dev/null +++ b/doc_archive/developer-zone/Writing_and_working_with_reference_metadata_classes.md @@ -0,0 +1,60 @@ +## Writing and working with reference metadata classes + +http://gatkforums.broadinstitute.org/gatk/discussion/1350/writing-and-working-with-reference-metadata-classes + +

Brief introduction to reference metadata (RMDs)

+

Note that the -B flag referred to below is deprecated; these docs need to be updated

+

The GATK allows you to process arbitrary numbers of reference metadata (RMD) files inside of walkers (previously we called this reference ordered data, or ROD). Common RMDs are things like dbSNP, VCF call files, and refseq annotations. The only real constraints on RMD files are that:

+ +

Inside of the GATK the RMD system has the concept of RMD tracks, which associate an arbitrary string name with the data in the associated RMD file. For example, the VariantEval module uses the named track eval to get calls for evaluation, and dbsnp as the track containing the database of known variants.

+

How do I get reference metadata files into my walker?

+

RMD files are extremely easy to get into the GATK using the -B syntax:

+
java -jar GenomeAnalysisTK.jar -R Homo_sapiens_assembly18.fasta -T PrintRODs -B:variant,VCF calls.vcf
+

In this example, the GATK will attempt to parse the file calls.vcf using the VCF parser and bind the VCF data to the RMD track named variant.

+

In general, you can provide as many RMD bindings to the GATK as you like:

+
java -jar GenomeAnalysisTK.jar -R Homo_sapiens_assembly18.fasta -T PrintRODs -B:calls1,VCF calls1.vcf -B:calls2,VCF calls2.vcf
+

Works just as well. Some modules may require specifically named RMD tracks -- like eval above -- and some are happy to just assess all RMD tracks of a certain class and work with those -- like VariantsToVCF.

+

1. Directly getting access to a single named track

+

In this snippet from SNPDensityWalker, we grab the eval track as a VariantContext object, only for the variants that are of type SNP:

+
public Pair<VariantContext, GenomeLoc> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+    VariantContext vc = tracker.getVariantContext(ref, "eval", EnumSet.of(VariantContext.Type.SNP), context.getLocation(), false);
+}
+

2. Grabbing anything that's convertible to a VariantContext

+

From VariantsToVCF we call the helper function tracker.getVariantContexts to look at all of the RMDs and convert what it can to VariantContext objects.

+
Allele refAllele = new Allele(Character.toString(ref.getBase()), true);
+Collection<VariantContext> contexts = tracker.getVariantContexts(INPUT_RMD_NAME, ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), refAllele, true, false);
+

3. Looking at all of the RMDs

+

Here's a totally general code snippet from PileupWalker.java. This code, as you can see, iterates over all of the GATKFeature objects in the reference ordered data, converting each RMD to a string and capturing these strings in a list. It finally grabs the dbSNP binding specifically for a more detailed string conversion, and then binds them all up in a single string for display along with the read pileup.

+

private String getReferenceOrderedData( RefMetaDataTracker tracker ) {
+        ArrayList rodStrings = new ArrayList();
+        for ( GATKFeature datum : tracker.getAllRods() ) {
+            if ( datum != null && ! (datum.getUnderlyingObject() instanceof DbSNPFeature) ) {
+                rodStrings.add(((ReferenceOrderedDatum)datum.getUnderlyingObject()).toSimpleString()); // TODO: Aaron: this line still survives, try to remove it
+            }
+        }
+        String rodString = Utils.join(", ", rodStrings);

+
        DbSNPFeature dbsnp = tracker.lookup(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, DbSNPFeature.class);
+
+        if ( dbsnp != null)
+            rodString += DbSNPHelper.toMediumString(dbsnp);
+
+        if ( !rodString.equals("") )
+            rodString = "[ROD: " + rodString + "]";
+
+        return rodString;
+}
+

How do I write my own RMD types?

+

Tracks of reference metadata are loaded using the Tribble infrastructure. Tracks are loaded using the feature codec and underlying type information. See the Tribble documentation for more information.

+

Tribble codecs that are in the classpath are automatically found; the GATK discovers all classes that implement the FeatureCodec class. Name resolution occurs using the -B type parameter, i.e. if the user specified:

+
-B:calls1,VCF calls1.vcf
+

The GATK looks for a FeatureCodec called VCFCodec.java to decode the record type. Alternately, if the user specified:

+
-B:calls1,MYAwesomeFormat calls1.maft
+

The GATK would look for a codec called MYAwesomeFormatCodec.java to decode the record type. This look-up is not case-sensitive, i.e. it will resolve MyAwEsOmEfOrMaT as well, though why you would want to write something so painfully ugly to read is beyond us.
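
As a rough sketch of that name-resolution rule (plain Java for illustration only, not the GATK's actual plugin-discovery code): the -B type tag plus "Codec" is matched case-insensitively against the discovered codec class names.

+
+import java.util.Arrays;
+import java.util.List;
+
+public class CodecNameResolutionSketch {
+    // Given the -B type tag (e.g. "MYAwesomeFormat"), the expected codec simple name is tag + "Codec".
+    public static String resolve(String typeTag, List<String> discoveredCodecClassNames) {
+        String wanted = typeTag + "Codec";
+        for (String name : discoveredCodecClassNames) {
+            if (name.equalsIgnoreCase(wanted)) {
+                return name;
+            }
+        }
+        return null;   // no codec found for this type tag
+    }
+
+    public static void main(String[] args) {
+        List<String> codecs = Arrays.asList("VCFCodec", "MYAwesomeFormatCodec");
+        System.out.println(resolve("MyAwEsOmEfOrMaT", codecs));   // prints MYAwesomeFormatCodec
+    }
+}
+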

\ No newline at end of file diff --git a/doc_archive/developer-zone/Writing_unit_tests_for_walkers.md b/doc_archive/developer-zone/Writing_unit_tests_for_walkers.md new file mode 100644 index 000000000..ec23889e6 --- /dev/null +++ b/doc_archive/developer-zone/Writing_unit_tests_for_walkers.md @@ -0,0 +1,133 @@ +## Writing unit tests for walkers + +http://gatkforums.broadinstitute.org/gatk/discussion/1339/writing-unit-tests-for-walkers + +

1. Testing core walkers is critical

+

Most GATK walkers are really too complex to easily test using the standard unit test framework. It's just not feasible to make artificial read piles and then extrapolate from simple tests passing whether the system as a whole is working correctly. However, we need some way to determine whether changes to the core of the GATK are altering the expected output of complex walkers like BaseRecalibrator or SingleSampleGenotyper. In addition to correctness, we want to make sure that the performance of key walkers isn't degrading over time, so that the speed of calling SNPs, cleaning indels, etc., isn't slowly creeping down. Since we are now using a Bamboo server to automatically build and run unit tests (as well as measure their runtimes), we want to put as many good walker tests as possible into the test framework so that we capture performance metrics over time.

+

2. The WalkerTest framework

+

To make this testing process easier, we've created a WalkerTest framework that lets you invoke the GATK using command-line GATK commands in the JUnit system and test for changes in your output files by comparing the current ant build results to previous run via an MD5 sum. It's a bit coarse grain, but it will work to ensure that changes to key walkers are detected quickly by the system, and authors can either update the expected MD5s or go track down bugs.

+

The system is fairly straightforward to use. Ultimately we will end up with JUnit-style tests in the unit testing structure. The code below checks the MD5 of the SingleSampleGenotyper's GELI text output at LOD 3 and LOD 10.

+
package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.broadinstitute.sting.WalkerTest;
+import org.junit.Test;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Arrays;
+
+public class SingleSampleGenotyperTest extends WalkerTest {
+    @Test
+    public void testLOD() {
+        HashMap<Double, String> e = new HashMap<Double, String>();
+        e.put( 10.0, "e4c51dca6f1fa999f4399b7412829534" );
+        e.put( 3.0, "d804c24d49669235e3660e92e664ba1a" );
+
+        for ( Map.Entry<Double, String> entry : e.entrySet() ) {
+            WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
+                   "-T SingleSampleGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s --variant_output_format GELI -L 1:10,000,000-11,000,000 -m EMPIRICAL -lod " + entry.getKey(), 1,
+                    Arrays.asList(entry.getValue()));
+            executeTest("testLOD", spec);
+        }
+    }
+}
+

The fundamental piece here is to inherit from WalkerTest. This gives you access to the executeTest() function that consumes a WalkerTestSpec:

+
    public WalkerTestSpec(String args, int nOutputFiles, List<String> md5s)
+

The WalkerTestSpec takes regular, command-line style GATK arguments describing what you want to run, the number of output files the walker will generate, and your expected MD5s for each of these output files. The args string can contain %s String.format specifications, and for each of the nOutputFiles, the executeTest() function will (1) generate a tmp file for output and (2) call String.format on your args to fill in the tmp output files in your arguments string. For example, in the above argument string varout is followed by %s, so our single SingleSampleGenotyper output is the variant output file.
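
As a minimal illustration of that %s substitution (plain Java, not the WalkerTest code itself; the temp file path is a placeholder):

+
+String argsTemplate = "-T SingleSampleGenotyper -varout %s --variant_output_format GELI -lod 3.0";
+String tmpOutputFile = "/tmp/walktest.tmp_param.example.tmp";   // placeholder for the temp file created by executeTest()
+String finalArgs = String.format(argsTemplate, tmpOutputFile);
+// finalArgs: -T SingleSampleGenotyper -varout /tmp/walktest.tmp_param.example.tmp --variant_output_format GELI -lod 3.0
+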

+

3. Example output

+

When you add a WalkerTest-inherited unit test to the GATK and then run the test build, you'll see output that looks like:

+
[junit] WARN  13:29:50,068 WalkerTest - -------------------------------------------------------------------------------- 
+[junit] WARN  13:29:50,068 WalkerTest - -------------------------------------------------------------------------------- 
+[junit] WARN  13:29:50,069 WalkerTest - Executing test testLOD with GATK arguments: -T SingleSampleGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout /tmp/walktest.tmp_param.05524470250256847817.tmp --variant_output_format GELI -L 1:10,000,000-11,000,000 -m EMPIRICAL -lod 3.0
+[junit]  
+[junit] WARN  13:29:50,069 WalkerTest - Executing test testLOD with GATK arguments: -T SingleSampleGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout /tmp/walktest.tmp_param.05524470250256847817.tmp --variant_output_format GELI -L 1:10,000,000-11,000,000 -m EMPIRICAL -lod 3.0
+[junit]  
+[junit] WARN  13:30:39,407 WalkerTest - Checking MD5 for /tmp/walktest.tmp_param.05524470250256847817.tmp [calculated=d804c24d49669235e3660e92e664ba1a, expected=d804c24d49669235e3660e92e664ba1a] 
+[junit] WARN  13:30:39,407 WalkerTest - Checking MD5 for /tmp/walktest.tmp_param.05524470250256847817.tmp [calculated=d804c24d49669235e3660e92e664ba1a, expected=d804c24d49669235e3660e92e664ba1a] 
+[junit] WARN  13:30:39,408 WalkerTest -   => testLOD PASSED 
+[junit] WARN  13:30:39,408 WalkerTest -   => testLOD PASSED 
+[junit] WARN  13:30:39,409 WalkerTest - -------------------------------------------------------------------------------- 
+[junit] WARN  13:30:39,409 WalkerTest - -------------------------------------------------------------------------------- 
+[junit] WARN  13:30:39,409 WalkerTest - Executing test testLOD with GATK arguments: -T SingleSampleGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout /tmp/walktest.tmp_param.03852477489430798188.tmp --variant_output_format GELI -L 1:10,000,000-11,000,000 -m EMPIRICAL -lod 10.0
+[junit]  
+[junit] WARN  13:30:39,409 WalkerTest - Executing test testLOD with GATK arguments: -T SingleSampleGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout /tmp/walktest.tmp_param.03852477489430798188.tmp --variant_output_format GELI -L 1:10,000,000-11,000,000 -m EMPIRICAL -lod 10.0
+[junit]  
+[junit] WARN  13:31:30,213 WalkerTest - Checking MD5 for /tmp/walktest.tmp_param.03852477489430798188.tmp [calculated=e4c51dca6f1fa999f4399b7412829534, expected=e4c51dca6f1fa999f4399b7412829534] 
+[junit] WARN  13:31:30,213 WalkerTest - Checking MD5 for /tmp/walktest.tmp_param.03852477489430798188.tmp [calculated=e4c51dca6f1fa999f4399b7412829534, expected=e4c51dca6f1fa999f4399b7412829534] 
+[junit] WARN  13:31:30,213 WalkerTest -   => testLOD PASSED 
+[junit] WARN  13:31:30,213 WalkerTest -   => testLOD PASSED 
+[junit] WARN  13:31:30,214 SingleSampleGenotyperTest -  
+[junit] WARN  13:31:30,214 SingleSampleGenotyperTest -  
+

4. Recommended location for GATK testing data

+

We keep all of the permanent GATK testing data in:

+
/humgen/gsa-scr1/GATK_Data/Validation_Data/
+

A good set of data to use for walker testing is the CEU daughter data from 1000 Genomes:

+
gsa2 ~/dev/GenomeAnalysisTK/trunk > ls -ltr /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_1*.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_1*.calls
+-rw-rw-r--+ 1 depristo wga  51M 2009-09-03 07:56 /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam
+-rw-rw-r--+ 1 depristo wga 185K 2009-09-04 13:21 /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls
+-rw-rw-r--+ 1 depristo wga 164M 2009-09-04 13:22 /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls
+-rw-rw-r--+ 1 depristo wga  24M 2009-09-04 15:00 /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam
+-rw-rw-r--+ 1 depristo wga  12M 2009-09-04 15:01 /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.454.bam
+-rw-r--r--+ 1 depristo wga  91M 2009-09-04 15:02 /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam
+

5. Test dependencies

+

The tests depend on a variety of input files that are generally constrained to three mount points on the internal Broad network:

+
*/seq/
+*/humgen/1kg/
+*/humgen/gsa-hpprojects/GATK/Data/Validation_Data/
+

To run the unit and integration tests you'll have to have access to these files. They may have different mount points on your machine (say, if you're running remotely over the VPN and have mounted the directories on your own machine).

+

6. MD5 database and comparing MD5 results

+

Every file that generates an MD5 sum as part of the WalkerTest framework will be copied to <MD5>.integrationtest in the integrationtests subdirectory of the GATK trunk. This MD5 database of results enables you to easily examine the results of an integration test as well as compare the results of a test before/after a code change. For example, below is a test of the UnifiedGenotyper where, due to a code change, the output VCF differs from the VCF with the expected MD5 value in the test code itself. The test provides the paths to the two results files as well as a diff command to compare the expected to the observed MD5:

+
[junit] --------------------------------------------------------------------------------    
+[junit] Executing test testParameter[-genotype] with GATK arguments: -T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout /tmp/walktest.tmp_param.05997727998894311741.tmp -L 1:10,000,000-10,010,000 -genotype    
+[junit] ##### MD5 file is up to date: integrationtests/ab20d4953b13c3fc3060d12c7c6fe29d.integrationtest    
+[junit] Checking MD5 for /tmp/walktest.tmp_param.05997727998894311741.tmp [calculated=ab20d4953b13c3fc3060d12c7c6fe29d, expected=0ac7ab893a3f550cb1b8c34f28baedf6]    
+[junit] ##### Test testParameter[-genotype] is going fail #####    
+[junit] ##### Path to expected   file (MD5=0ac7ab893a3f550cb1b8c34f28baedf6): integrationtests/0ac7ab893a3f550cb1b8c34f28baedf6.integrationtest    
+[junit] ##### Path to calculated file (MD5=ab20d4953b13c3fc3060d12c7c6fe29d): integrationtests/ab20d4953b13c3fc3060d12c7c6fe29d.integrationtest    
+[junit] ##### Diff command: diff integrationtests/0ac7ab893a3f550cb1b8c34f28baedf6.integrationtest integrationtests/ab20d4953b13c3fc3060d12c7c6fe29d.integrationtest
+

Examining the diff, we see a few lines where the DP count has changed in the new code:

+
> diff integrationtests/0ac7ab893a3f550cb1b8c34f28baedf6.integrationtest integrationtests/ab20d4953b13c3fc3060d12c7c6fe29d.integrationtest  | head
+385,387c385,387
+< 1     10000345        .       A       .       106.54  .       AN=2;DP=33;Dels=0.00;MQ=89.17;MQ0=0;SB=-10.00   GT:DP:GL:GQ     0/0:25:-0.09,-7.57,-75.74:74.78
+< 1     10000346        .       A       .       103.75  .       AN=2;DP=31;Dels=0.00;MQ=88.85;MQ0=0;SB=-10.00   GT:DP:GL:GQ     0/0:24:-0.07,-7.27,-76.00:71.99
+< 1     10000347        .       A       .       109.79  .       AN=2;DP=31;Dels=0.00;MQ=88.85;MQ0=0;SB=-10.00   GT:DP:GL:GQ     0/0:26:-0.05,-7.85,-84.74:78.04
+---
+> 1     10000345        .       A       .       106.54  .       AN=2;DP=32;Dels=0.00;MQ=89.50;MQ0=0;SB=-10.00   GT:DP:GL:GQ     0/0:25:-0.09,-7.57,-75.74:74.78
+> 1     10000346        .       A       .       103.75  .       AN=2;DP=30;Dels=0.00;MQ=89.18;MQ0=0;SB=-10.00   GT:DP:GL:GQ     0/0:24:-0.07,-7.27,-76.00:71.99
+> 1     10000347        .       A       .       109.79  .       AN=2;DP=30;Dels=0.00;MQ=89.18;MQ0=0;SB=-10.00   GT:DP:GL:GQ     0/0:26:-0.05,-7.85,-84.74:78
+

Whether this is the expected change is up to you to decide, but the system makes it as easy as possible to see the consequences of your code change.

+

7. Testing for Exceptions

+

The walker test framework supports an additional syntax for ensuring that a particular Java Exception is thrown when a walker executes, using a simple alternate version of the WalkerTestSpec object. Rather than specifying the MD5 of the result, you provide a single subclass of Exception.class, and the testing framework will ensure that when the walker runs, an instance (of that class or a subclass) of your expected exception is thrown. The system also flags the test if no exception is thrown at all.

+

For example, the following code tests that the GATK can detect and error out when incompatible VCF and FASTA files are given:

+
@Test public void fail8() { executeTest("hg18lex-v-b36", test(lexHG18, callsB36)); }
+
+private WalkerTest.WalkerTestSpec test(String ref, String vcf) {
+    return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 -B:two,vcf "
+            + vcf + " -F POS,CHROM -R "
+            + ref +  " -o %s",
+            1, UserException.IncompatibleSequenceDictionaries.class);
+
+}
+

During the integration test this looks like:

+
[junit] Executing test hg18lex-v-b36 with GATK arguments: -T VariantsToTable -M 10 -B:two,vcf /humgen/gsa-hpprojects/GATK/data/Validation_Data/lowpass.N3.chr1.raw.vcf -F POS,CHROM -R /humgen/gsa-hpprojects/GATK/data/Validation_Data/lexFasta/lex.hg18.fasta -o /tmp/walktest.tmp_param.05541601616101756852.tmp -l WARN -et NO_ET
+[junit]    [junit] Wanted exception class org.broadinstitute.sting.utils.exceptions.UserException$IncompatibleSequenceDictionaries, saw class org.broadinstitute.sting.utils.exceptions.UserException$IncompatibleSequenceDictionaries
+[junit]   => hg18lex-v-b36 PASSED
+

8. Miscellaneous information

+ \ No newline at end of file diff --git a/doc_archive/developer-zone/Writing_walkers.md b/doc_archive/developer-zone/Writing_walkers.md new file mode 100644 index 000000000..8afc2fbb9 --- /dev/null +++ b/doc_archive/developer-zone/Writing_walkers.md @@ -0,0 +1,68 @@ +## Writing walkers + +http://gatkforums.broadinstitute.org/gatk/discussion/1302/writing-walkers + +

1. Introduction

+

The core concept behind GATK tools is the walker, a class that implements the three core operations: filtering, mapping, and reducing.

+ +

Users of the GATK will provide a walker to run their analyses. The engine will produce a result by first filtering the dataset, running a map operation, and finally reducing the map operation to a single result.
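
As a quick sketch of that filter/map/reduce shape, modeled on the GATKDocsExample walker reproduced elsewhere in this archive (the class name and counting logic are purely illustrative, and the GATK imports are omitted):

+
+public class CountSitesWalker extends RodWalker<Integer, Integer> {
+    // "map": called at each position; emit one unit of work per site visited
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        return 1;
+    }
+    // starting value for the running total
+    public Integer reduceInit() { return 0; }
+    // "reduce": fold each map result into the running total
+    public Integer reduce(Integer value, Integer sum) { return value + sum; }
+    // called once at the end with the final reduced result
+    public void onTraversalDone(Integer result) {
+        System.out.println("Sites visited: " + result);
+    }
+}
+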

+

2. Creating a Walker

+

To be usable by the GATK, the walker must satisfy the following properties:

+ +

3. Examples

+

The best way to get started with the GATK is to explore the walkers we've written. Here are the best walkers to look at when getting started:

+ +

$STING_HOME/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java

+ +

$STING_HOME/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java

+ +

$STING_HOME/java/src/org/broadinstitute/sting/gatk/examples/papergenotyper/GATKPaperGenotyper.java

+

Please note that the walker above is NOT the UnifiedGenotyper. While conceptually similar to the UnifiedGenotyper, the GATKPaperGenotyper uses a much simpler calling model for increased clarity and readability.

+

4. External walkers and the 'external' directory

+

The GATK can absorb external walkers placed in a directory of your choosing. By default, that directory is called 'external' and is relative to the Sting git root directory (for example, ~/src/Sting/external). However, you can choose to place that directory anywhere on the filesystem and specify its complete path using the ant external.dir property.

+
ant -Dexternal.dir=~/src/external
+

The GATK will check each directory under the external directory (but not the external directory itself!) for small build scripts. These build scripts must contain at least a compile target that compiles your walker and places the resulting class file into the GATK's class file output directory. The following is a sample compile target:

+
<target name="compile" depends="init">
+    <javac srcdir="." destdir="${build.dir}" classpath="${gatk.classpath}" />
+</target>
+

As a convenience, the build.dir ant property will be predefined to be the GATK's class file output directory and the gatk.classpath property will be predefined to be the GATK's core classpath. Once this structure is defined, any invocation of the ant build scripts will build the contents of the external directory as well as the GATK itself.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Writing_walkers_in_Scala.md b/doc_archive/developer-zone/Writing_walkers_in_Scala.md new file mode 100644 index 000000000..467169972 --- /dev/null +++ b/doc_archive/developer-zone/Writing_walkers_in_Scala.md @@ -0,0 +1,55 @@ +## Writing walkers in Scala + +http://gatkforums.broadinstitute.org/gatk/discussion/1354/writing-walkers-in-scala + +

1. Install scala somewhere

+

At the Broad, we typically put it somewhere like this:

+
/home/radon01/depristo/work/local/scala-2.7.5.final
+

Next, create a symlink from this directory to trunk/scala/installation:

+
ln -s /home/radon01/depristo/work/local/scala-2.7.5.final trunk/scala/installation
+

2. Setting up your path

+

Right now the only way to get scala walkers into the GATK is by explicitly setting your CLASSPATH in your .my.cshrc file:

+
setenv CLASSPATH /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/FourBaseRecaller.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/Playground.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/StingUtils.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/bcel-5.2.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/colt-1.2.0.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/google-collections-0.9.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/javassist-3.7.ga.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/junit-4.4.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/log4j-1.2.15.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/picard-1.02.63.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/picard-private-875.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/reflections-0.9.2.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/sam-1.01.63.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/simple-xml-2.0.4.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/GATKScala.jar:/humgen/gsa-scr1/depristo/local/scala-2.7.5.final/lib/scala-library.jar
+

Really this needs to be manually updated whenever any of the libraries are updated. If you see this error:

+
Caused by: java.lang.RuntimeException: java.util.zip.ZipException: error in opening zip file
+        at org.reflections.util.VirtualFile.iterable(VirtualFile.java:79)
+        at org.reflections.util.VirtualFile$5.transform(VirtualFile.java:169)
+        at org.reflections.util.VirtualFile$5.transform(VirtualFile.java:167)
+        at org.reflections.util.FluentIterable$3.transform(FluentIterable.java:43)
+        at org.reflections.util.FluentIterable$3.transform(FluentIterable.java:41)
+        at org.reflections.util.FluentIterable$ForkIterator.computeNext(FluentIterable.java:81)
+        at com.google.common.collect.AbstractIterator.tryToComputeNext(AbstractIterator.java:132)
+        at com.google.common.collect.AbstractIterator.hasNext(AbstractIterator.java:127)
+        at org.reflections.util.FluentIterable$FilterIterator.computeNext(FluentIterable.java:102)
+        at com.google.common.collect.AbstractIterator.tryToComputeNext(AbstractIterator.java:132)
+        at com.google.common.collect.AbstractIterator.hasNext(AbstractIterator.java:127)
+        at org.reflections.util.FluentIterable$TransformIterator.computeNext(FluentIterable.java:124)
+        at com.google.common.collect.AbstractIterator.tryToComputeNext(AbstractIterator.java:132)
+        at com.google.common.collect.AbstractIterator.hasNext(AbstractIterator.java:127)
+        at org.reflections.Reflections.scan(Reflections.java:69)
+        at org.reflections.Reflections.<init>(Reflections.java:47)
+        at org.broadinstitute.sting.utils.PackageUtils.<clinit>(PackageUtils.java:23)
+

It's because the libraries aren't updated. Basically, just do an ls of your trunk/dist directory after the GATK has been built, make this your classpath as above, and tack on:

+
/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/GATKScala.jar:/humgen/gsa-scr1/depristo/local/scala-2.7.5.final/lib/scala-library.jar
+

A command that almost works (but you'll need to replace the spaces with colons) is:

+
#setenv CLASSPATH $CLASSPATH `ls /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/*.jar` /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/GATKScala.jar:/humgen/gsa-scr1/depristo/local/scala-2.7.5.final/lib/scala-library.jar
+

3. Building scala code

+

All of the Scala source code lives in scala/src, which you build using ant scala

+

There are already some example Scala walkers in scala/src, so doing a standard checkout, installing scala, and setting up your environment should allow you to run something like:

+
gsa2 ~/dev/GenomeAnalysisTK/trunk > ant scala
+Buildfile: build.xml
+
+init.scala:
+
+scala:
+     [echo] Sting: Compiling scala!
+   [scalac] Compiling 2 source files to /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/scala/classes
+   [scalac] warning: there were deprecation warnings; re-run with -deprecation for details
+   [scalac] one warning found
+   [scalac] Compile suceeded with 1 warning; see the compiler output for details.
+   [delete] Deleting: /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/GATKScala.jar
+      [jar] Building jar: /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/GATKScala.jar
+

4. Invoking a scala walker

+

Until we can include Scala walkers along with the main GATK jar (avoiding the classpath issue too) you have to invoke your scala walkers using this syntax:

+
java -Xmx2048m org.broadinstitute.sting.gatk.CommandLineGATK -T BaseTransitionTableCalculator -R /broad/1KG/reference/human_b36_both.fasta -I /broad/1KG/DCC_merged/freeze5/NA12878.pilot2.SLX.bam -l INFO -L 1:1-100
+

Here, the BaseTransitionTableCalculator walker is written in Scala and being loaded into the system by the GATK walker manager. Otherwise everything looks like a normal GATK module.

\ No newline at end of file diff --git a/doc_archive/dictionary/Bait_bias.md b/doc_archive/dictionary/Bait_bias.md new file mode 100644 index 000000000..1812860d0 --- /dev/null +++ b/doc_archive/dictionary/Bait_bias.md @@ -0,0 +1,6 @@ +## Bait bias + +http://gatkforums.broadinstitute.org/gatk/discussion/6333/bait-bias + +

Bait bias (single bait bias or reference bias artifact) is a type of artifact that affects data generated through hybrid selection methods.

+

These artifacts occur during or after the target selection step, and correlate with substitution rates that are biased or higher for sites having one base on the reference/positive strand relative to sites having the complementary base on that strand. For example, a G>T artifact during the target selection step might result in a higher (G>T)/(C>A) substitution rate at sites with a G on the positive strand (and C on the negative), relative to sites with the flip (C positive)/(G negative). This is known as the "G-Ref" artifact.

\ No newline at end of file diff --git a/doc_archive/dictionary/Biallelic_vs_Multiallelic_sites.md b/doc_archive/dictionary/Biallelic_vs_Multiallelic_sites.md new file mode 100644 index 000000000..37cff1636 --- /dev/null +++ b/doc_archive/dictionary/Biallelic_vs_Multiallelic_sites.md @@ -0,0 +1,19 @@ +## Biallelic vs Multiallelic sites + +http://gatkforums.broadinstitute.org/gatk/discussion/6455/biallelic-vs-multiallelic-sites + +

A biallelic site is a specific locus in a genome that contains two observed alleles, counting the reference as one, and therefore allowing for one variant allele. In practical terms, this is what you would call a site where, across multiple samples in a cohort, you have evidence for a single non-reference allele. Shown below is a toy example in which the consensus sequences for samples 1-3 have a deletion at position 7. Sample 4 matches the reference. This is considered a biallelic site because there are only two possible alleles-- a deletion, or the reference allele G.

+
           1 2 3 4 5 6 7 8 9
+Reference: A T A T A T G C G
+Sample 1 : A T A T A T - C G
+Sample 2 : A T A T A T - C G
+Sample 3 : A T A T A T - C G
+Sample 4 : A T A T A T G C G
+
+

A multiallelic site is a specific locus in a genome that contains three or more observed alleles, again counting the reference as one, and therefore allowing for two or more variant alleles. This is what you would call a site where, across multiple samples in a cohort, you see evidence for two or more non-reference alleles. Shown below is a toy example in which the consensus sequences for samples 1-3 have a deletion or a SNP at the 7th position. Sample 4 matches the reference. This is considered a multiallelic site because there are four possible alleles-- a deletion, the reference allele G, a C (SNP), or a T (SNP). True multiallelic sites are not observed very frequently unless you look at very large cohorts, so they are often taken as a sign of a noisy region where artifacts are likely.

+
           1 2 3 4 5 6 7 8 9
+Reference: A T A T A T G C G
+Sample 1 : A T A T A T - C G
+Sample 2 : A T A T A T C C G
+Sample 3 : A T A T A T T C G
+Sample 4 : A T A T A T G C G
\ No newline at end of file diff --git a/doc_archive/dictionary/Bisulfite_sequencing___Cytosine_methylation.md b/doc_archive/dictionary/Bisulfite_sequencing___Cytosine_methylation.md new file mode 100644 index 000000000..b1737b568 --- /dev/null +++ b/doc_archive/dictionary/Bisulfite_sequencing___Cytosine_methylation.md @@ -0,0 +1,6 @@ +## Bisulfite sequencing / Cytosine methylation + +http://gatkforums.broadinstitute.org/gatk/discussion/6330/bisulfite-sequencing-cytosine-methylation + +

Cytosine methylation is a key component in epigenetic regulation of gene expression and frequently occurs at CpG sites throughout the genome. Bisulfite sequencing is a technique used to analyze the genome-wide methylation profiles on a single nucleotide level [doi:10.1093/nar/gki901]. Sodium bisulfite efficiently and selectively deaminates unmethylated cytosine residues to uracil without affecting 5-methyl cytosine (methylated). Using restriction enzymes and PCR to enrich for regions of the genome that have high CpG content, the resulting reduced genome comprises ~1% of the original genome but includes key regulatory sequences as well as repeated regions.

+

The protocol involves several steps. First, genomic DNA is digested with a restriction endonuclease such as MspI, which targets CG dinucleotides. This results in DNA fragments with CG at the ends. Next, the fragments are size selected (via gel electrophoresis), which facilitates the enrichment of CpG-containing sequences. This is followed by bisulfite treatment, which converts unmethylated C nucleotides to uracil (U) while methylated cytosines will remain intact. The bisulfite-treated DNA is amplified with a proofreading-deficient DNA polymerase to facilitate amplification of both methylated cytosines as well as the C -> U converted bases. Subsequent to PCR amplification, each original unmethylated cytosine will be converted to either a T (+ strand) or an A (- strand), while methylated C will remain a C (+ strand) or a G (- strand). The PCR products are then sequenced using conventional methods and aligned to a reference.

\ No newline at end of file diff --git a/doc_archive/dictionary/Downsampling.md b/doc_archive/dictionary/Downsampling.md new file mode 100644 index 000000000..15d22fa2e --- /dev/null +++ b/doc_archive/dictionary/Downsampling.md @@ -0,0 +1,44 @@ +## Downsampling + +http://gatkforums.broadinstitute.org/gatk/discussion/1323/downsampling + +

Downsampling is a process by which read depth is reduced, either at a particular position or within a region.

+

Normal sequencing and alignment protocols can often yield pileups with vast numbers of reads aligned to a single section of the genome in otherwise well-behaved datasets. Because of the frequency of these 'speed bumps', the GATK now downsamples pileup data unless explicitly overridden.

+

Note that there is also a proportional "downsample to fraction" mechanism that is mostly intended for testing the effect of different overall coverage means on analysis results.

+

See below for details of how this is implemented and controlled in GATK.

+
+

1. Downsampling to a target coverage

+

The principle of this downsampling type is to downsample reads to a given capping threshold coverage. Its purpose is to get rid of excessive coverage, because above a certain depth, having additional data is not informative and imposes unreasonable computational costs. The downsampling process takes two different forms depending on the type of analysis it is used with. For locus-based traversals (LocusWalkers like UnifiedGenotyper and ActiveRegionWalkers like HaplotypeCaller), downsample_to_coverage controls the maximum depth of coverage at each locus. For read-based traversals (ReadWalkers like BaseRecalibrator), it controls the maximum number of reads sharing the same alignment start position. For ReadWalkers you will typically need to use much lower dcov values than you would with LocusWalkers to see an effect. Note that this downsampling option does not produce an unbiased random sampling from all available reads at each locus: instead, the primary goal of the to-coverage downsampler is to maintain an even representation of reads from all alignment start positions when removing excess coverage. For a truly unbiased random sampling of reads, use -dfrac instead. Also note that the coverage target is an approximate goal that is not guaranteed to be met exactly: the downsampling algorithm will under some circumstances retain slightly more or less coverage than requested.

+

Defaults

+

The GATK's default downsampler (invoked by -dcov) exhibits the following properties:

+ +

By default, the downsampler is limited to 1000 reads per sample. This value can be adjusted either per-walker or per-run.

+

Customizing

+

From the command line:

+ +

To modify the walker's default behavior:

+ +

Algorithm details

+

The downsampler algorithm is designed to maintain uniform coverage while preserving a low memory footprint in regions of especially deep data. Given an already established pileup, a single-base locus, and a pile of reads with an alignment start of single-base locus + 1, the outline of the algorithm is as follows:

+

For each sample:

+ +

Now walk backward through each set of reads having the same alignment start. If the count of reads having the same alignment start is > 1, throw out one randomly selected read.

+ A hedged sketch of that last step is shown below.
+
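
A hedged sketch of that last step in plain Java (not the actual GATK downsampler; the method and parameter names are made up):

+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+public class DownsampleSketch {
+    // Thin a set of reads sharing the same alignment start by repeatedly
+    // throwing out one randomly selected read until the target count is reached.
+    public static <T> List<T> downsample(List<T> readsAtSameStart, int target, Random rng) {
+        List<T> kept = new ArrayList<T>(readsAtSameStart);
+        while (kept.size() > target && kept.size() > 1) {
+            kept.remove(rng.nextInt(kept.size()));
+        }
+        return kept;
+    }
+}
+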
+

2. Downsampling to a fraction of the coverage

+

Reads will be downsampled so the specified fraction remains; e.g. if you specify -dfrac 0.25, three-quarters of the reads will be removed, and the remaining one quarter will be used in the analysis. This method of downsampling is truly unbiased and random. It is typically used to simulate the effect of generating different amounts of sequence data for a given sample. For example, you can use this in a pilot experiment to evaluate how much target coverage you need to aim for in order to obtain enough coverage in all loci of interest.

\ No newline at end of file diff --git a/doc_archive/dictionary/Heterozygosity.md b/doc_archive/dictionary/Heterozygosity.md new file mode 100644 index 000000000..5c5cc4679 --- /dev/null +++ b/doc_archive/dictionary/Heterozygosity.md @@ -0,0 +1,9 @@ +## Heterozygosity + +http://gatkforums.broadinstitute.org/gatk/discussion/8603/heterozygosity + +

Heterozygosity in population genetics

+

In the context of population genetics, heterozygosity can refer to the fraction of individuals in a given population that are heterozygous at a given locus, or the fraction of loci that are heterozygous in an individual. See the Wikipedia entries on Heterozygosity and Coalescent Theory as well as the book "Population Genetics: A Concise Guide" by John H. Gillespie for further details on related theory.

+

Heterozygosity in GATK

+

In GATK genotyping, we use an "expected heterozygosity" value to compute the prior probability that a locus is non-reference. Given the expected heterozygosity hets, we calculate the probability of N samples being hom-ref at a site as 1 - sum_{i=1}^{2N} (hets / i). The default value provided for humans is hets = 1e-3; a value of 0.001 implies that two randomly chosen chromosomes from the population of organisms would differ from each other at a rate of 1 in 1000 bp. In this context hets is analogous to the parameter theta from population genetics. The hets parameter value can be modified if desired.
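
As a quick worked example (a hedged illustration using the default value and the single-sample case for simplicity): with N = 1 sample, so 2N = 2 chromosomes, and hets = 0.001, the prior probability of the site being hom-ref is

$$1 - \left(\frac{0.001}{1} + \frac{0.001}{2}\right) = 1 - 0.0015 = 0.9985$$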

+

Note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype, which in the GATK is purely determined by the probability of the observed data P(D | AB) under the model that there may be an AB heterozygous genotype. The posterior probability of this AB genotype would use the hets prior, but the GATK only uses this posterior probability in determining the probability that a site is polymorphic. So changing the hets parameters only increases the chance that a site will be called non-reference across all samples, but doesn't actually change the output genotype likelihoods at all, as these aren't posterior probabilities. The one quantity that changes whether the GATK considers the possibility of a heterozygous genotype at all is the ploidy, which describes how many copies of each chromosome each individual in the species carries.

\ No newline at end of file diff --git a/doc_archive/dictionary/Hybrid_selection.md b/doc_archive/dictionary/Hybrid_selection.md new file mode 100644 index 000000000..635d651a5 --- /dev/null +++ b/doc_archive/dictionary/Hybrid_selection.md @@ -0,0 +1,8 @@ +## Hybrid selection + +http://gatkforums.broadinstitute.org/gatk/discussion/6331/hybrid-selection + +

Hybrid selection is a method that enables selection of specific sequences from a pool of genomic DNA for targeted sequencing analyses via pull-down assays. Typical applications include the selection of exome sequences or pathogen-specific sequences in complex biological samples. Hybrid selection involves the use of baits to select the desired fragments.

+

Briefly, baits are RNA (or sometimes DNA) molecules synthesized with biotinylated nucleotides. The biotinylated nucleotides are ligands for streptavidin, enabling RNA:DNA hybrids to be captured in solution. The hybridization targets are sheared genomic DNA fragments, which have been "polished" with synthetic adapters to facilitate PCR cloning downstream. Hybridization of the baits with the denatured targets is followed by selective capture of the RNA:DNA "hybrids" using streptavidin-coated beads via pull-down assays or columns.

+

Systematic errors, ultimately leading to sequence bias and incorrect variant calls, can arise at several steps. See the GATK dictionary entries bait bias and pre-adapter artifacts for more details.

+

Please see the following reference for the theory behind this technique.

\ No newline at end of file diff --git a/doc_archive/dictionary/Jumping_libraries.md b/doc_archive/dictionary/Jumping_libraries.md new file mode 100644 index 000000000..9221bf774 --- /dev/null +++ b/doc_archive/dictionary/Jumping_libraries.md @@ -0,0 +1,5 @@ +## Jumping libraries + +http://gatkforums.broadinstitute.org/gatk/discussion/6326/jumping-libraries + +

Jumping libraries are created to bypass difficult to align/map regions, such as those containing repetitive DNA sequences. Briefly, the DNA of interest is identified, cut into fragments either with restriction enzymes or by shearing. The size-selected fragments are ligated to adapters for bead-capture and circularized. After bead-capture, the DNA is linearized via restriction enzymes, and can be sequenced using adapter primers facing in outward [reverse/forward (RF)] directions. These library inserts are considered jumping because the ends originate from distal genomic DNA sequences and are ligated adjacent to one another during circularization. Potential artifacts of this method include small inserts (lacking the linearizing restriction enzyme sequence), which are inward-facing [forward/reverse (FR)] (non-jumping) read pairs. In addition, chimeras result from the paired ends falling on different chromosomes, the insert size exceeding the maximum of 100 KB, or two times the mode of the insert size for outward-facing pairs. For additional information, see the Wikipedia article.

\ No newline at end of file diff --git a/doc_archive/dictionary/Likelihoods_and_Probabilities.md b/doc_archive/dictionary/Likelihoods_and_Probabilities.md new file mode 100644 index 000000000..9c4ba0aef --- /dev/null +++ b/doc_archive/dictionary/Likelihoods_and_Probabilities.md @@ -0,0 +1,16 @@ +## Likelihoods and Probabilities + +http://gatkforums.broadinstitute.org/gatk/discussion/7860/likelihoods-and-probabilities + +

There are several instances in the GATK documentation where you will encounter the terms "likelihood" and "probability", because key tools in the variant discovery workflow rely heavily on Bayesian statistics. For example, the HaplotypeCaller, our most prominent germline SNP and indel caller, uses Bayesian statistics to determine genotypes.

+

So what do likelihood and probability mean and how are they related to each other in the Bayesian context?

+

In Bayesian statistics (as opposed to frequentist statistics), we are typically trying to evaluate the posterior probability of a hypothesis (H) based on a series of observations (data, D).

+

Bayes' rule states that

+

$${P(H|D)}=\frac{P(H)P(D|H)}{P(D)}$$

+

where the bit we care about most, P(D|H), is the probability of observing D given the hypothesis H. This can also be formulated as L(H|D), i.e. the likelihood of the hypothesis H given the observation D:

+

$$P(D|H)=L(H|D)$$

+

We use the term likelihood instead of probability to describe the term on the right because we cannot calculate a meaningful probability distribution on a hypothesis, which by definition is binary (it will either be true or false) -- but we can determine the likelihood that a hypothesis is true or false given a set of observations. For a more detailed explanation of these concepts, please see the following lesson (http://ocw.mit.edu/courses/mathematics/18-05-introduction-to-probability-and-statistics-spring-2014/readings/MIT18_05S14_Reading11.pdf).

+

Now you may wonder, what about the posterior probability P(H|D) that we eventually calculate through Bayes' rule? Isn't that a "probability of a hypothesis"? Well yes; in Bayesian statistics, we can calculate a posterior probability distribution on a hypothesis, because its probability distribution is relative to all of the other competing hypotheses (http://www.smbc-comics.com/index.php?id=4127). Tadaa.
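
Concretely, the P(D) in the denominator of Bayes' rule is obtained by summing the numerator over all of the competing hypotheses, which is what makes the posterior a proper probability distribution:

$$P(H_i|D)=\frac{P(H_i)\,P(D|H_i)}{\sum_j P(H_j)\,P(D|H_j)}$$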

+

See this HaplotypeCaller doc article for a worked out explanation of how we calculate and use genotype likelihoods in germline variant calling.

+

So always remember this, if nothing else: the terms likelihood and probability are not interchangeable in the Bayesian context, even though they are often used interchangeably in common English.

+

A special thanks to Jon M. Bloom PhD (MIT) for his assistance in the preparation of this article.

\ No newline at end of file diff --git a/doc_archive/dictionary/Mate_unmapped_records.md b/doc_archive/dictionary/Mate_unmapped_records.md new file mode 100644 index 000000000..5b2286dcd --- /dev/null +++ b/doc_archive/dictionary/Mate_unmapped_records.md @@ -0,0 +1,19 @@ +## Mate unmapped records + +http://gatkforums.broadinstitute.org/gatk/discussion/6976/mate-unmapped-records + +

Mate unmapped records are identifiable by the SAM flag value 8 (mate unmapped).

+

It is possible for a BAM to have multiple types of mate-unmapped records. These mate unmapped records are distinct from mate missing records, where the mate is altogether absent from the BAM. Of the three types of mate unmapped records listed below, we describe only the first two in this dictionary entry.

+
1. Singly mapping pair.
2. A secondary/supplementary record is flagged as mate-unmapped but the mate is in fact mapped.
3. Both reads in a pair are unmapped.
+
+

(1) Singly mapping pair

+

A mapped read's unmapped mate is marked in its SAM record in an unexpected manner that allows the pair to sort together. If you look at these unmapped reads, the alignment columns 3 and 4 (RNAME and POS) indicate they align, in fact identically to the mapped mate. However, what is distinct is the asterisk * in the CIGAR field (column 6), which indicates the record is unmapped. This allows us to (i) identify the unmapped read as having passed through the aligner, and (ii) keep the pairs together in file manipulations that use either coordinate- or queryname-sorted BAMs. For example, when a genomic interval of reads is taken to create a new BAM, the pair remains together. For file manipulations dependent on such sorting, we can deduce that these mate unmapped records are immune to becoming missing mates.

+

(2) Mate unmapped record whose mate is mapped but in a pair that excludes the record

+

The second type of mate unmapped record applies to multimapping read sets processed through MergeBamAlignment, such as in Tutorial#6483. Besides reassigning primary and secondary flags within multimapping sets according to a user-specified strategy, MergeBamAlignment marks secondary records with the mate unmapped flag. Specifically, after BWA-MEM alignment, records in multimapping sets are all mate-mapped; after going through MergeBamAlignment, the secondary records become mate-unmapped while the primary alignments remain mate-mapped. This effectively minimizes the association between secondary records and their previous mates.

+
+

How do tools treat them differently?

+

GATK tools typically exclude secondary/supplementary records from consideration. However, tools will process the mapped read in a singly mapping pair. For example, MarkDuplicates skips secondary records but marks duplicate singly mapping reads.

\ No newline at end of file diff --git a/doc_archive/dictionary/OxoG_oxidative_artifacts.md b/doc_archive/dictionary/OxoG_oxidative_artifacts.md new file mode 100644 index 000000000..8373bab87 --- /dev/null +++ b/doc_archive/dictionary/OxoG_oxidative_artifacts.md @@ -0,0 +1,12 @@ +## OxoG oxidative artifacts + +http://gatkforums.broadinstitute.org/gatk/discussion/6328/oxog-oxidative-artifacts + +

Oxidation of guanine to 8-oxoguanine is one of the most common pre-adapter artifacts associated with genomic library preparation, arising from a combination of heat, shearing, and metal contaminants in a sample (doi: 10.1093/nar/gks1443). The 8-oxoguanine base can pair with either cytosine or adenine, ultimately leading to G→T transversion mutations during PCR amplification.

+

This occurs when a G on the template strand is oxidized, giving it an affinity for binding to A rather than the usual C. Thus, PCR will introduce apparent G>T substitutions in read 1 and C>A in read 2. In the resulting alignments, a given G>T or C>A observation could either be:

+
1. a true mutation
2. an 8-oxoguanine artifact
3. some other kind of artifact.
+

The variants (C→A)/(G→T) tend to occur in specific sequence contexts e.g. CCG→CAG (doi:10.1093/nar/gks1443). Although occurring at relatively low frequencies, these artifacts can have profound impacts on variant calling fidelity (doi:10.1093/nar/gks1443).

\ No newline at end of file diff --git a/doc_archive/dictionary/PF_reads___Illumina_chastity_filter.md b/doc_archive/dictionary/PF_reads___Illumina_chastity_filter.md new file mode 100644 index 000000000..61d600da5 --- /dev/null +++ b/doc_archive/dictionary/PF_reads___Illumina_chastity_filter.md @@ -0,0 +1,11 @@ +## PF reads / Illumina chastity filter + +http://gatkforums.broadinstitute.org/gatk/discussion/6329/pf-reads-illumina-chastity-filter + +

Illumina sequencers perform an internal quality filtering procedure called chastity filter, and reads that pass this filter are called PF for pass-filter. According to Illumina, chastity is defined as the ratio of the brightest base intensity divided by the sum of the brightest and second brightest base intensities. Clusters of reads pass the filter if no more than 1 base call has a chastity value below 0.6 in the first 25 cycles. This filtration process removes the least reliable clusters from the image analysis results.
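
A hedged sketch of that definition in plain Java (an illustration of the quoted formula and threshold only, not Illumina's actual filtering code):

+
+public class ChastityFilterSketch {
+    // chastity = brightest intensity / (brightest + second brightest), per cycle
+    public static double chastity(double[] baseIntensities) {
+        double brightest = Double.NEGATIVE_INFINITY, second = Double.NEGATIVE_INFINITY;
+        for (double intensity : baseIntensities) {
+            if (intensity > brightest) { second = brightest; brightest = intensity; }
+            else if (intensity > second) { second = intensity; }
+        }
+        return brightest / (brightest + second);
+    }
+
+    // A cluster passes the filter if at most one of the first 25 cycles has chastity < 0.6.
+    public static boolean passesFilter(double[][] intensitiesPerCycle) {
+        int failures = 0;
+        int cycles = Math.min(25, intensitiesPerCycle.length);
+        for (int c = 0; c < cycles; c++) {
+            if (chastity(intensitiesPerCycle[c]) < 0.6) failures++;
+        }
+        return failures <= 1;
+    }
+}
+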

+

For additional information on chastity filters, please see:

+ +

Both articles can be found at http://www.Illumina.com

\ No newline at end of file diff --git a/doc_archive/dictionary/Paired-end___mate-pair.md b/doc_archive/dictionary/Paired-end___mate-pair.md new file mode 100644 index 000000000..ce3c7b60d --- /dev/null +++ b/doc_archive/dictionary/Paired-end___mate-pair.md @@ -0,0 +1,18 @@ +## Paired-end / mate-pair + +http://gatkforums.broadinstitute.org/gatk/discussion/6327/paired-end-mate-pair + +

In paired-end sequencing, the library preparation yields a set of fragments, and the machine sequences each fragment from both ends; for example if you have a 300bp contiguous fragment, the machine will sequence e.g. bases 1-75 (forward direction) and bases 225-300 (reverse direction) of the fragment.

+

In mate-pair sequencing, the library preparation yields two fragments that are distal to each other in the genome and opposite in orientation to that of a paired-end fragment.

+

The three read orientation categories are forward reverse (FR), reverse forward (RF), and reverse-reverse/forward-forward (TANDEM). In general, paired-end reads tend to be in a FR orientation, have relatively small inserts (~300 - 500 bp), and are particularly useful for the sequencing of fragments that contain short repeat regions. Mate-pair fragments are generally in a RF conformation, contain larger inserts (~3 kb), and enable sequence coverage of genomic regions containing large structural rearrangements. Tandem reads can result from inversions and rearrangements during library preparation.

+

Here is a more illustrative example:

+

FR: 5' --F--> <--R-- 5' (in slang called "innie" because they point inward)

+

RF: <--R-- 5' 5' --F--> (in slang called "outie" because they point outward)

+

TANDEM: 5' --F--> 5' --F--> or <--R-- 5' <--R-- 5'

+

The figure below illustrates this graphically along with the SAM flags that correspond to the FR and RF configurations.

+ +

For detailed explanations of library construction strategies (for Illumina sequencers) and how read orientations are determined, please see:

+ \ No newline at end of file diff --git a/doc_archive/dictionary/Parallelism.md b/doc_archive/dictionary/Parallelism.md new file mode 100644 index 000000000..4a0fdf06f --- /dev/null +++ b/doc_archive/dictionary/Parallelism.md @@ -0,0 +1,86 @@ +## Parallelism + +http://gatkforums.broadinstitute.org/gatk/discussion/1988/parallelism + +

This document explains the concepts involved and how they are applied within the GATK (and Crom+WDL or Queue where applicable). For specific configuration recommendations, see the companion document on parallelizing GATK tools.

+
+

1. The concept of parallelism

+

Parallelism is a way to make a program finish faster by performing several operations in parallel, rather than sequentially (i.e. waiting for each operation to finish before starting the next one).

+

Imagine you need to cook rice for sixty-four people, but your rice cooker can only make enough rice for four people at a time. If you have to cook all the batches of rice sequentially, it's going to take all night. But if you have eight rice cookers that you can use in parallel, you can finish up to eight times faster.

+

This is a very simple idea but it has a key requirement: you have to be able to break down the job into smaller tasks that can be done independently. It's easy enough to divide portions of rice because rice itself is a collection of discrete units. In contrast, let's look at a case where you can't make that kind of division: it takes one pregnant woman nine months to grow a baby, but you can't do it in one month by having nine women share the work.

+

The good news is that most GATK runs are more like rice than like babies. Because GATK tools are built to use the Map/Reduce method (see doc for details), most GATK runs essentially consist of a series of many small independent operations that can be parallelized.

+

A quick warning about tradeoffs

+

Parallelism is a great way to speed up processing on large amounts of data, but it has "overhead" costs. Without getting too technical at this point, let's just say that parallelized jobs need to be managed, you have to set aside memory for them, regulate file access, collect results and so on. So it's important to balance the costs against the benefits, and avoid dividing the overall work into too many small jobs.

+

Going back to the introductory example, you wouldn't want to use a million tiny rice cookers that each boil a single grain of rice. They would take way too much space on your countertop, and the time it would take to distribute each grain then collect it when it's cooked would negate any benefits from parallelizing in the first place.

+

Parallel computing in practice (sort of)

+

OK, parallelism sounds great (despite the tradeoffs caveat), but how do we get from cooking rice to executing programs? What actually happens in the computer?

+

Consider that when you run a program like the GATK, you're just telling the computer to execute a set of instructions.

+

Let's say we have a text file and we want to count the number of lines in it. The set of instructions to do this can be as simple as:

+ +

Note that "tell us the number" can mean writing it to the console, or storing it somewhere for use later on.

+

Now let's say we want to know the number of words on each line. The set of instructions would be:

+ +

And so on until we've read all the lines, and finally we can close the file. It's pretty straightforward, but if our file has a lot of lines, it will take a long time, and it will probably not use all the computing power we have available.

+

So to parallelize this program and save time, we just cut up this set of instructions into separate subsets like this:

+ +

Here, the "read the Nth line" steps can be performed in parallel, because they are all independent operations.

+

You'll notice that we added a step, "index the lines". That's a little bit of preliminary work that allows us to perform the "read the Nth line" steps in parallel (or in any order we want) because it tells us how many lines there are and where to find each one within the file. It makes the whole process much more efficient. As you may know, the GATK requires index files for the main data files (reference, BAMs and VCFs); the reason is essentially to have that indexing step already done.
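
For instance, a toy version of that "index the lines" step (plain Java, nothing GATK-specific) just records the byte offset at which each line starts, so that any line can later be read independently:

+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.util.ArrayList;
+import java.util.List;
+
+public class LineIndexerSketch {
+    // Return the byte offset of the start of each line in the file.
+    public static List<Long> indexLines(String path) throws IOException {
+        List<Long> offsets = new ArrayList<Long>();
+        try (RandomAccessFile file = new RandomAccessFile(path, "r")) {
+            long length = file.length();
+            if (length > 0) {
+                offsets.add(0L);              // the first line starts at byte 0
+            }
+            for (long pos = 0; pos < length; pos++) {
+                if (file.read() == '\n' && pos + 1 < length) {
+                    offsets.add(pos + 1);     // a new line starts right after each newline
+                }
+            }
+        }
+        return offsets;
+    }
+}
+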

+

Anyway, that's the general principle: you transform your linear set of instructions into several subsets of instructions. There's usually one subset that has to be run first and one that has to be run last, but all the subsets in the middle can be run at the same time (in parallel) or in whatever order you want.
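
And a tiny illustration of running the independent "middle" subsets in parallel and then gathering the results (plain Java, not GATK code; the example lines are made up):

+
+import java.util.Arrays;
+import java.util.List;
+
+public class ParallelWordCountSketch {
+    public static void main(String[] args) {
+        List<String> lines = Arrays.asList("the quick brown fox", "jumps over", "the lazy dog");
+        // Each line is an independent task, so the lines can be processed in any order or
+        // in parallel, and the per-line word counts gathered back in order at the end.
+        int[] wordsPerLine = lines.parallelStream()
+                                  .mapToInt(line -> line.split("\\s+").length)
+                                  .toArray();
+        System.out.println(Arrays.toString(wordsPerLine));   // prints [4, 2, 3]
+    }
+}
+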

+
+

2. Parallelizing the GATK

+

There are three different modes of parallelism offered by the GATK, and to really understand the difference, you first need to understand the different levels of computing that are involved.

+

A quick word about levels of computing

+

By levels of computing, we mean the computing units in terms of hardware: the core, the machine (or CPU) and the cluster or cloud.

+ +

Parallelism can be applied at all three of these levels, but in different ways of course, and under different names. Parallelism takes the name of multi-threading at the core and machine levels, and scatter-gather at the cluster level.

+

Multi-threading

+

In computing, a thread of execution is a set of instructions that the program issues to the processor to get work done. In single-threading mode, a program only sends a single thread at a time to the processor and waits for it to be finished before sending another one. In multi-threading mode, the program may send several threads to the processor at the same time.

+ +

Not making sense? Let's go back to our earlier example, in which we wanted to count the number of words in each line of our text document. Hopefully it is clear that the first version of our little program (one long set of sequential instructions) is what you would run in single-threaded mode. And the second version (several subsets of instructions) is what you would run in multi-threaded mode, with each subset forming a separate thread. You would send out the first thread, which performs the preliminary work; then once it's done you would send the "middle" threads, which can be run in parallel; then finally once they're all done you would send out the final thread to clean up and collect final results.

+

If you're still having a hard time visualizing what the different threads are like, just imagine that you're doing cross-stitching. If you're a regular human, you're working with just one hand. You're pulling a needle and thread (a single thread!) through the canvas, making one stitch after another, one row after another. Now try to imagine an octopus doing cross-stitching. He can make several rows of stitches at the same time using a different needle and thread for each. Multi-threading in computers is surprisingly similar to that.

+

Hey, if you have a better example, let us know in the forum and we'll use that instead.

+

Alright, now that you understand the idea of multithreading, let's get practical: how do we get the GATK to use multi-threading?

+

There are two options for multi-threading with the GATK, controlled by the arguments -nt and -nct, respectively. They can be combined, since they act at different levels of computing:

+ +

Not all GATK tools can use these options due to the nature of the analyses that they perform and how they traverse the data. Even in the case of tools that are used sequentially to perform a multi-step process, the individual tools may not support the same options. For example, at time of writing (Dec. 2012), of the tools involved in local realignment around indels, RealignerTargetCreator supports -nt but not -nct, while IndelRealigner does not support either of these options.
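
To give a purely illustrative idea of the syntax (file names are placeholders), a RealignerTargetCreator command using 4 data threads and a BaseRecalibrator command using 4 CPU threads might look like this:

    java -jar GenomeAnalysisTK.jar -T RealignerTargetCreator -R reference.fasta -I input.bam -o realigner.intervals -nt 4
    java -jar GenomeAnalysisTK.jar -T BaseRecalibrator -R reference.fasta -I input.bam -knownSites dbsnp.vcf -o recal_data.table -nct 4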

+

In addition, there are some important technical details that affect how these options can be used with optimal results. Those are explained along with specific recommendations for the main GATK tools in a companion document on parallelizing the GATK.

+

Scatter-gather

+

If you Google it, you'll find that the term scatter-gather can refer to a lot of different things, including strategies to get the best price quotes from online vendors, methods to control memory allocation and… an indie-rock band. What all of those things have in common (except possibly the band) is that they involve breaking up a task into smaller, parallelized tasks (scattering) then collecting and integrating the results (gathering). That should sound really familiar to you by now, since it's the general principle of parallel computing.

+

So yes, "scatter-gather" is really just another way to say we're parallelizing things. OK, but how is it different from multithreading, and why do we need yet another name?

+

As you know by now, multithreading specifically refers to what happens internally when the program (in our case, the GATK) sends several sets of instructions to the processor to achieve the instructions that you originally gave it in a single command-line. In contrast, the scatter-gather strategy as used by the GATK involves separate programs. There are two pipelining solutions that we support for scatter-gathering GATK jobs, Cromwell+WDL and Queue. They are quite different, but both are able to generate separate GATK jobs (each with its own command-line) to achieve the instructions given in a script.

+ +

At the simplest level, the script can involve a single GATK tool*. In that case, the execution engine (Cromwell or Queue) will create separate GATK commands that will each run that tool on a portion of the input data (= the scatter step). The results of each run will be stored in temporary files. Then once all the runs are done, the engine will collate all the results into the final output files, as if the tool had been run as a single command (= the gather step).

+

Note that Queue and Cromwell have additional capabilities, such as managing the use of multiple GATK tools in a dependency-aware manner to run complex pipelines, but that is outside the scope of this article. To learn more about pipelining the GATK with Queue, please see the Queue documentation. To learn more about Cromwell+WDL, see the WDL website.

+

Compare and combine

+

So you see, scatter-gather is a very different process from multi-threading because the parallelization happens outside of the program itself. The big advantage is that this opens up the upper level of computing: the cluster level. Remember, the GATK program is limited to dispatching threads to the processor of the machine on which it is run – it cannot by itself send threads to a different machine. But an execution engine like Queue or Cromwell can dispatch scattered GATK jobs to different machines in a computing cluster or on a cloud platform by interfacing with the appropriate job management software.

+

That being said, multithreading has the great advantage that all the cores within a machine have access to shared memory with very high bandwidth. In contrast, the multiple machines on a network used for scatter-gather are fundamentally limited by network transfer costs.

+

The good news is that you can combine scatter-gather and multithreading: use Queue or Cromwell to scatter GATK jobs to different nodes on your cluster or cloud platform, then use the GATK's internal multithreading capabilities to parallelize the jobs running on each node.

+

Going back to the rice-cooking example, it's as if instead of cooking the rice yourself, you hired a catering company to do it for you. The company assigns the work to several people, who each have their own cooking station with multiple rice cookers. Now you can feed a lot more people in the same amount of time! And you don't even have to clean the dishes.

\ No newline at end of file diff --git a/doc_archive/dictionary/Pedigree___PED_files.md b/doc_archive/dictionary/Pedigree___PED_files.md new file mode 100644 index 000000000..b9d850c3d --- /dev/null +++ b/doc_archive/dictionary/Pedigree___PED_files.md @@ -0,0 +1,37 @@ +## Pedigree / PED files + +http://gatkforums.broadinstitute.org/gatk/discussion/7696/pedigree-ped-files + +

A pedigree is a structured description of the familial relationships between samples.

+

Some GATK tools are capable of incorporating pedigree information in the analysis they perform if provided in the form of a PED file through the --pedigree (or -ped) argument.
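
For example, a pedigree-aware tool such as PhaseByTransmission might be invoked like this (file names are placeholders):

    java -jar GenomeAnalysisTK.jar -T PhaseByTransmission -R reference.fasta -V input.vcf -ped family.ped -o phased.vcf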

+
+

PED file format

+

PED files are tabular text files describing meta-data about the samples. See http://www.broadinstitute.org/mpg/tagger/faq.html and http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped for more information.

+

The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:

+ +

The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. If an individual's sex is unknown, then any character other than 1 or 2 can be used in the fifth column.

+

A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a quantitative trait or an "affected status" column: GATK will automatically detect which type (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).

+

Affected status should be coded as follows:

+ +

If any value outside of -9, 0, 1 and 2 is detected, then the samples are assumed to have phenotype values, which are interpreted as string phenotype values.

+

Note that genotypes (column 7 onwards) cannot be specified to the GATK.

+

You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that line will be ignored, so make sure none of the IDs start with this character.

+

Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to tell the GATK PED parser that the corresponding fields are missing from the ped file.

+

Example

+

Here are two individuals (one row = one person):

+
+FAM001  1  0 0  1  2
+FAM001  2  0 0  1  2
+
\ No newline at end of file diff --git a/doc_archive/dictionary/Phred-scaled_Quality_Scores.md b/doc_archive/dictionary/Phred-scaled_Quality_Scores.md new file mode 100644 index 000000000..78b0038a3 --- /dev/null +++ b/doc_archive/dictionary/Phred-scaled_Quality_Scores.md @@ -0,0 +1,69 @@ +## Phred-scaled Quality Scores + +http://gatkforums.broadinstitute.org/gatk/discussion/4260/phred-scaled-quality-scores + +

You may have noticed that a lot of the scores that are output by the GATK are in Phred scale. The Phred scale was originally used to represent base quality scores emitted by the Phred program in the early days of the Human Genome Project (see this Wikipedia article for more historical background). Now they are widely used to represent probabilities and confidence scores in other contexts of genome science.

+

Phred scale in context

+

In the context of sequencing, Phred-scaled quality scores are used to represent how confident we are in the assignment of each base call by the sequencer.

+

In the context of variant calling, Phred-scaled quality scores can be used to represent many types of probabilities. The most commonly used in GATK is the QUAL score, or variant quality score. It is used in much the same way as the base quality score: the variant quality score is a Phred-scaled estimate of how confident we are that the variant caller correctly identified that a given genome position displays variation in at least one sample.

+

Phred scale in practice

+

In today’s sequencing output, by convention, most useable Phred-scaled base quality scores range from 2 to 40, with some variations in the range depending on the origin of the sequence data (see the FASTQ format documentation for details). However, Phred-scaled quality scores in general can range anywhere from 0 to infinity. A higher score indicates a higher probability that a particular decision is correct, while conversely, a lower score indicates a higher probability that the decision is incorrect.

+

The Phred quality score (Q) is logarithmically related to the error probability (E).

+

$$ Q = -10 \log_{10} E $$

+

So we can interpret this score as an estimate of error, where the error is e.g. the probability that the base is called incorrectly by the sequencer, but we can also interpret it as an estimate of accuracy, where the accuracy is e.g. the probability that the base was identified correctly by the sequencer. Depending on how we decide to express it, we can make the following calculations:

+

If we want the probability of error (E), we take:

+

$$ E = 10 ^{-\left(\frac{Q}{10}\right)} $$

+

And conversely, if we want to express this as the estimate of accuracy (A), we simply take

+

$$
\begin{eqnarray}
A &=& 1 - E \nonumber \\
&=& 1 - 10 ^{-\left(\frac{Q}{10}\right)} \nonumber
\end{eqnarray}
$$
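
For instance, plugging in a Phred score of 30 gives:

$$ E = 10^{-\left(\frac{30}{10}\right)} = 0.001, \qquad A = 1 - 0.001 = 0.999 $$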

+

Here is a table of how to interpret a range of Phred Quality Scores. It is largely adapted from the Wikipedia page for Phred Quality Score.

+

For many purposes, a Phred Score of 20 or above is acceptable, because this means that whatever it qualifies is 99% accurate, with a 1% chance of error.

| Phred Quality Score | Error | Accuracy (1 - Error) |
|---------------------|-------|----------------------|
| 10 | 1/10 = 10% | 90% |
| 20 | 1/100 = 1% | 99% |
| 30 | 1/1000 = 0.1% | 99.9% |
| 40 | 1/10000 = 0.01% | 99.99% |
| 50 | 1/100000 = 0.001% | 99.999% |
| 60 | 1/1000000 = 0.0001% | 99.9999% |

And finally, here is a graphical representation of the Phred scores showing their relationship to accuracy and error probabilities.

+ +

The red line shows the error, and the blue line shows the accuracy. Of course, as error decreases, accuracy increases symmetrically.

+

Note: You can see that below Q20 (which is how we usually refer to a Phred score of 20), the curve is really steep, meaning that as the Phred score decreases, you lose confidence very rapidly. In contrast, above Q20, both of the graphs level out. This is why Q20 is a good cutoff score for many basic purposes.

\ No newline at end of file diff --git a/doc_archive/dictionary/Pre-adapter_artifacts_(in_hybrid_selection).md b/doc_archive/dictionary/Pre-adapter_artifacts_(in_hybrid_selection).md new file mode 100644 index 000000000..95c657ee8 --- /dev/null +++ b/doc_archive/dictionary/Pre-adapter_artifacts_(in_hybrid_selection).md @@ -0,0 +1,6 @@ +## Pre-adapter artifacts (in hybrid selection) + +http://gatkforums.broadinstitute.org/gatk/discussion/6332/pre-adapter-artifacts-in-hybrid-selection + +

Various sources of error affect the hybrid selection (HS) process. Pre-adapter artifacts are those that arise in the preparation step(s) prior to the ligation of the PCR adapters. These artifacts occur on the original template strand, before the addition of adapters, so they correlate with read number orientation in a specific way.

+

A classic example is the shearing of target genomic DNA leading to oxidation of guanine at position 8, forming 8-oxoguanine (8-OxoG, OxoG) (doi:10.1093/nar/gks1443) (see also the OxoG entry in this dictionary).

\ No newline at end of file diff --git a/doc_archive/dictionary/Read_groups.md b/doc_archive/dictionary/Read_groups.md new file mode 100644 index 000000000..16d4f6143 --- /dev/null +++ b/doc_archive/dictionary/Read_groups.md @@ -0,0 +1,65 @@ +## Read groups + +http://gatkforums.broadinstitute.org/gatk/discussion/6472/read-groups + +

There is no formal definition of what a read group is, but in practice, this term refers to a set of reads that were generated from a single run of a sequencing instrument.

+

In the simple case where a single library preparation derived from a single biological sample was run on a single lane of a flowcell, all the reads from that lane run belong to the same read group. When multiplexing is involved, then each subset of reads originating from a separate library run on that lane will constitute a separate read group.

+

Read groups are identified in the SAM/BAM/CRAM file by a number of tags that are defined in the official SAM specification. These tags, when assigned appropriately, allow us to differentiate not only samples, but also various technical features that are associated with artifacts. With this information in hand, we can mitigate the effects of those artifacts during the duplicate marking and base recalibration steps. The GATK requires several read group fields to be present in input files and will fail with errors if this requirement is not satisfied. See this article for common problems related to read groups.

+

To see the read group information for a BAM file, use the following command.

+
samtools view -H sample.bam | grep '@RG'
+

This prints the lines starting with @RG within the header, e.g. as shown in the example below.

+
@RG ID:H0164.2  PL:illumina PU:H0164ALXX140820.2    LB:Solexa-272222    PI:0    DT:2014-08-20T00:00:00-0400 SM:NA12878  CN:BI
+
+

Meaning of the read group fields required by GATK

+ +

If your sample collection's BAM files lack required fields or do not differentiate pertinent factors within the fields, use Picard's AddOrReplaceReadGroups to add or appropriately rename the read group fields as outlined here.
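
For example, a minimal AddOrReplaceReadGroups command might look like the following, where the read group values mirror the example @RG line above and would be replaced with your own metadata:

    java -jar picard.jar AddOrReplaceReadGroups \
        I=sample.bam \
        O=sample_rg.bam \
        RGID=H0164.2 \
        RGLB=Solexa-272222 \
        RGPL=illumina \
        RGPU=H0164ALXX140820.2 \
        RGSM=NA12878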

+
+

Deriving ID and PU fields from read names

+

Here we illustrate how to derive both ID and PU fields from read names as they are formed in the data produced by the Broad Genomic Services pipelines (other sequence providers may use different naming conventions). We break down the common portion of two different read names from a sample file. The unique portion of each read name, which comes after the flow cell lane and is separated by colons, consists of the tile number, the x-coordinate of the cluster and the y-coordinate of the cluster.

+
H0164ALXX140820:2:1101:10003:23460
+H0164ALXX140820:2:1101:15118:25288
+

Breaking down the common portion of the query names:

+
H0164____________ #portion of @RG ID and PU fields indicating Illumina flow cell
+_____ALXX140820__ #portion of @RG PU field indicating barcode or index in a multiplexed run
+_______________:2 #portion of @RG ID and PU fields indicating flow cell lane
+
+

Multi-sample and multiplexed example

+

Suppose I have a trio of samples: MOM, DAD, and KID. Each has two DNA libraries prepared, one with 400 bp inserts and another with 200 bp inserts. Each of these libraries is run on two lanes of an Illumina HiSeq, requiring 3 x 2 x 2 = 12 lanes of data. When the data come off the sequencer, I would create 12 bam files, with the following @RG fields in the header:

+
Dad's data:
+@RG     ID:FLOWCELL1.LANE1      PL:ILLUMINA     LB:LIB-DAD-1 SM:DAD      PI:200
+@RG     ID:FLOWCELL1.LANE2      PL:ILLUMINA     LB:LIB-DAD-1 SM:DAD      PI:200
+@RG     ID:FLOWCELL1.LANE3      PL:ILLUMINA     LB:LIB-DAD-2 SM:DAD      PI:400
+@RG     ID:FLOWCELL1.LANE4      PL:ILLUMINA     LB:LIB-DAD-2 SM:DAD      PI:400
+
+Mom's data:
+@RG     ID:FLOWCELL1.LANE5      PL:ILLUMINA     LB:LIB-MOM-1 SM:MOM      PI:200
+@RG     ID:FLOWCELL1.LANE6      PL:ILLUMINA     LB:LIB-MOM-1 SM:MOM      PI:200
+@RG     ID:FLOWCELL1.LANE7      PL:ILLUMINA     LB:LIB-MOM-2 SM:MOM      PI:400
+@RG     ID:FLOWCELL1.LANE8      PL:ILLUMINA     LB:LIB-MOM-2 SM:MOM      PI:400
+
+Kid's data:
+@RG     ID:FLOWCELL2.LANE1      PL:ILLUMINA     LB:LIB-KID-1 SM:KID      PI:200
+@RG     ID:FLOWCELL2.LANE2      PL:ILLUMINA     LB:LIB-KID-1 SM:KID      PI:200
+@RG     ID:FLOWCELL2.LANE3      PL:ILLUMINA     LB:LIB-KID-2 SM:KID      PI:400
+@RG     ID:FLOWCELL2.LANE4      PL:ILLUMINA     LB:LIB-KID-2 SM:KID      PI:400
+

Note the hierarchical relationship between read groups (unique for each lane) to libraries (sequenced on two lanes) and samples (across four lanes, two lanes for each library).

\ No newline at end of file diff --git a/doc_archive/dictionary/Reference_Genome_Components.md b/doc_archive/dictionary/Reference_Genome_Components.md new file mode 100644 index 000000000..185872fdb --- /dev/null +++ b/doc_archive/dictionary/Reference_Genome_Components.md @@ -0,0 +1,79 @@ +## Reference Genome Components + +http://gatkforums.broadinstitute.org/gatk/discussion/7857/reference-genome-components + +

Document is in BETA. It may be incomplete and/or inaccurate. Post suggestions to the Comments section.

+
+

This document defines several components of a reference genome. We use the human GRCh38/hg38 assembly to illustrate.

+

GRCh38/hg38 is the assembly of the human genome released in December 2013, which uses alternate or ALT contigs to represent common complex variation, including HLA loci. Alternate contigs are also present in past assemblies but not to the extent we see with GRCh38. Many of the improvements in GRCh38 are the result of other genome sequencing and analysis projects, including the 1000 Genomes Project.

+

The ideogram is from the Genome Reference Consortium website and showcases GRCh38.p7. The zoomed region illustrates how regions in blue are full of Ns.

+

Analysis set reference genomes have special features to accommodate sequence read alignment. This type of genome reference can differ from the reference you use to browse the genome.

+ +
+

Nomenclature: words to describe components of reference genomes

+ +
+

The GATK perspective on reference genomes

+

Within GATK documentation, Tutorial#8017 outlines how to map reads in an alternate contig aware manner and discusses some of the implications of mapping reads to reference genomes with alternate contigs.

+

GATK tools allow for use of a genomic intervals list that tells tools which regions of the genome the tools should act on. Judicious use of an intervals list, e.g. one that excludes regions of Ns and low complexity repeat regions in the genome, makes processes more efficient. This brings us to the next point.

+

Specifying contigs with colons in their names, as occurs for new contigs in GRCh38, requires special handling for GATK versions prior to v3.6. Please use the following workaround.

+ +

Viewing CRAM alignments on genome browsers

+

Because CRAM compression depends on the alignment reference genome, tools that use CRAM files ensure correct decompression by comparing reference contig MD5 hashtag values. These are sensitive to any changes in the sequence, e.g. masking with Ns. This can have implications for viewing alignments in genome browsers when there is a mismatch between the reference that is loaded in the browser and the reference that was used in alignment. If you are using a version of tools for which this is an issue, be sure to load the original analysis set reference genome to view the CRAM alignments.

+

Should I switch to a newer reference?

+

Yes you should. In addition to adding many alternate contigs, GRCh38 corrects thousands of SNPs and indels in the GRCh37 assembly that are absent in the population and are likely sequencing artifacts. It also includes synthetic centromeric sequence and updates non-nuclear genomic sequence.

+

The ability to recognize alternate haplotypes for loci is a drastic improvement that GRCh38 makes possible. Going forward, expanding genomics data will help identify variants for alternate haplotypes, improve existing and add additional alternate haplotypes and give us a better accounting of alternate haplotypes within populations. We are already seeing improvements and additions in the patch releases to reference genomes, e.g. the seven minor releases of GRCh38 available at the time of this writing.

+

Note that variants produced by alternate haplotypes when they are represented on the primary assembly may or may not be present in data resources, e.g. dbSNP. This could have varying degrees of impact, including negligible, for any process that relies on known variant sites. Consider the impact this discrepant coverage in data resources may have for your research aims and weigh this against the impact of missing variants because their sequence context is unaccounted for in previous assemblies.

+
+

External resources

+
  1. New 11/16/2016 For a brief history and discussion on challenges in using GRCh38, see the 2015 Genome Biology article Extending reference assembly models by Church et al. (DOI: 10.1186/s13059-015-0587-3).
  2. For press releases highlighting improvements in GRCh38 from December 2013, see http://www.ncbi.nlm.nih.gov/news/12-23-2013-grch38-released/ and http://genomeref.blogspot.co.uk/2013/12/announcing-grch38.html. The latter post summarizes major improvements, including the correction of thousands of SNPs and indels in GRCh37 not seen in the population and the inclusion of synthetic centromeric sequence.
  3. Recent releases of BWA, e.g. v0.7.15+, handle alt contig mapping and HLA typing. See the BWA repository for information. See these pages for download and installation instructions.
  4. The Genome Reference Consortium (GRC) provides human, mouse, zebrafish and chicken sequences, and this particular webpage gives an overview of GRCh38. Namely, an interactive chromosome ideogram marks regions with corresponding alternate loci, regions with fix patches and regions containing novel patches. For additional assembly terminology, see http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/info/definitions.shtml.
  5. The UCSC Genome Browser allows browsing and download of genomes, including analysis sets, from many different species. For more details on the difference between GRCh38 reference and analysis sets, see ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/README.txt and ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/analysisSet/README.txt, respectively. In addition, the site provides annotation files, e.g. here is the annotation database for GRCh38. Within this particular page, the file named gap.txt.gz catalogues the gapped regions of the assembly full of Ns. For our illustration above, the corresponding region in this file shows:

         585    chr14    0           10000       1    N    10000       telomere     no
         1      chr14    10000       16000000    2    N    15990000    short_arm    no
         707    chr14    16022537    16022637    4    N    100         contig       no

  6. The Integrative Genomics Viewer is a desktop application for viewing genomics data including alignments. The tool accesses reference genomes you provide via file or URL or that it hosts over a server. The numerous hosted reference genomes include GRCh38. See this page for information on hosted reference genomes. For the most up-to-date list of hosted genomes, open IGV and go to Genomes>Load Genome From Server. A menu lists genomes you can make available in the main genome dropdown menu.
\ No newline at end of file diff --git a/doc_archive/dictionary/Spanning_or_overlapping_deletions.md b/doc_archive/dictionary/Spanning_or_overlapping_deletions.md new file mode 100644 index 000000000..a3784b8b5 --- /dev/null +++ b/doc_archive/dictionary/Spanning_or_overlapping_deletions.md @@ -0,0 +1,15 @@ +## Spanning or overlapping deletions + +http://gatkforums.broadinstitute.org/gatk/discussion/6926/spanning-or-overlapping-deletions + +

We use the term spanning deletion or overlapping deletion to refer to a deletion that spans a position of interest.

+

The presence of a spanning deletion affects how we can represent genotypes at any site(s) that it spans for those samples that carry the deletion, whether in heterozygous or homozygous variant form. Page 8, item 5 of the VCF v4.3 specification reserves the * allele to reference overlapping deletions. This is not to be confused with the bracketed asterisk <*> used to denote symbolic alternate alleles.

+
+ +

Here we illustrate with four human samples. Bob and Lian each have a heterozygous A to T single polymorphism at position 20, our position of interest. Kyra has a 9 bp deletion from position 15 to 23 on both homologous chromosomes that extends across position 20. Lian and Omar each are heterozygous for the same 9 bp deletion. Omar and Bob's other allele is the reference A.

+

What are the genotypes for each individual at position 20? For Bob, the reference A and variant T alleles are clearly present for a genotype of A/T.

+

What about Lian? Lian has a variant T allele plus a 9 bp deletion overlapping position 20. To notate the deletion as we do single nucleotide deletions is technically inaccurate. We need a placeholder notation to signify absent sequence that extends beyond the position of interest and that is listed for an earlier position, in our case position 14. The solution is to use a star or asterisk * at position 20 to refer to the spanning deletion. Using this convention, Lian's genotype is T/*.

+

At the sample-level, Kyra and Omar would not have records for position 20. However, we are comparing multiple samples and so we indicate the spanning deletion at position 20 with *. Omar's genotype is A/* and Kyra's is */*.

+
+ +

In the VCF, depending on the format used by tools, positions equivalent to our example position 20 may or may not be listed. If listed, such as in the first example VCF shown, the spanning deletion is noted with the asterisk * under the ALT column. The spanning deletion is then referred to in the genotype GT for Kyra, Lian and Omar. Alternatively, a VCF may altogether avoid referencing the spanning deletion by listing the variant with the spanning deletion together with the deletion. This is shown in the second example VCF at position 14.
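
As a minimal sketch of the first style of representation (the contig name, QUAL, FILTER and INFO values are placeholders; the genotypes follow the scenario described above), the record at position 20 could look like:

    #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  Bob  Lian  Kyra  Omar
    chr1    20   .   A    T,*  .     .       .     GT      0/1  1/2   2/2   0/2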

\ No newline at end of file diff --git a/doc_archive/faqs/At_what_point_should_I_merge_read_group_BAM_files_belonging_to_the_same_sample_into_a_single_file?.md b/doc_archive/faqs/At_what_point_should_I_merge_read_group_BAM_files_belonging_to_the_same_sample_into_a_single_file?.md new file mode 100644 index 000000000..d294fce31 --- /dev/null +++ b/doc_archive/faqs/At_what_point_should_I_merge_read_group_BAM_files_belonging_to_the_same_sample_into_a_single_file?.md @@ -0,0 +1,12 @@ +## At what point should I merge read group BAM files belonging to the same sample into a single file? + +http://gatkforums.broadinstitute.org/gatk/discussion/6057/at-what-point-should-i-merge-read-group-bam-files-belonging-to-the-same-sample-into-a-single-file + +

It is fairly common to have multiple read groups for a sample, either from sequencing multiple libraries or from spreading a library across multiple lanes. It seems this causes a lot of confusion, and people often tell us they're not sure how to organize the data for the pre-processing steps or how to feed the data into HaplotypeCaller.

+

Well, there are several options for organizing the processing. We have a fairly detailed FAQ article that describes our preferred workflow for pre-processing data from multiplexed sequencing and multi-library designs. But in this article we describe, at a simpler level, the two main options depending on how you want to provide the analysis-ready BAM files to the variant caller.

+

To produce a combined per-sample bam file to feed to HaplotypeCaller (most common)

+

The simplest thing to do is to input all the bam files that belong to that sample, either at the MarkDuplicates step, the Indel Realignment step or at the BQSR step. The choice depends mostly on how deep the coverage is. High depth means a lot of data to process at the same time, which slows down Indel Realignment. This is because Indel Realignment ignores all read group information and simply processes all reads together. BQSR doesn't suffer from that problem because it processes read groups separately. In either case, when you input all samples together, the bam that gets written out with the processed data will include all the libraries / read groups in one handy per-sample file.

+

Note: We do not require the PU field in the @RG; however, when it is present, BQSR will consider the PU field over all other fields.

+

To produce a separate bam file for each read group (less common)

+

Another option is to keep all the bam files separate until variant calling, and then input them to HaplotypeCaller together. You can do this by simply running Indel Realignment and BQSR on each of the bams separately. You can then input all of the bams into HaplotypeCaller at once. This works even if you want to run HaplotypeCaller in GVCF mode, which can only be done on a single sample at a time. As long as the SM tags are identical, HaplotypeCaller will recognize that it's a single-sample run. This is because the GATK engine will merge the data before presenting it to the HaplotypeCaller tool, so HaplotypeCaller does not know nor care whether the data came from many files or one file.
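
For example (file names are placeholders), running HaplotypeCaller in GVCF mode on several per-read-group BAMs from the same sample might look like this:

    java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R reference.fasta \
        -I sample1_readgroup1.bam -I sample1_readgroup2.bam \
        -ERC GVCF -o sample1.g.vcf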

+

Note: If you input many bam files into Indel Realigner, the default output is one bam file. However, you can output one bam file for each input bam file by using -nWayOut.

\ No newline at end of file diff --git a/doc_archive/faqs/Can_I_apply_the_germline_variant_joint_calling_workflow_to_my_RNAseq_data?.md b/doc_archive/faqs/Can_I_apply_the_germline_variant_joint_calling_workflow_to_my_RNAseq_data?.md new file mode 100644 index 000000000..e8f9d0935 --- /dev/null +++ b/doc_archive/faqs/Can_I_apply_the_germline_variant_joint_calling_workflow_to_my_RNAseq_data?.md @@ -0,0 +1,8 @@ +## Can I apply the germline variant joint calling workflow to my RNAseq data? + +http://gatkforums.broadinstitute.org/gatk/discussion/7363/can-i-apply-the-germline-variant-joint-calling-workflow-to-my-rnaseq-data + +

We have not yet validated the joint genotyping methods (HaplotypeCaller in -ERC GVCF mode per-sample then GenotypeGVCFs per-cohort) on RNAseq data. Our standard recommendation is to process RNAseq samples individually as laid out in the RNAseq-specific documentation.

+

However, we know that a lot of people have been trying out the joint genotyping workflow on RNAseq data, and there do not seem to be any major technical problems. You are welcome to try it on your own data, with the caveat that we cannot guarantee correctness of results, and may not be able to help you if something goes wrong. Please be sure to examine your results carefully and critically.

+

If you do pursue this, you will need to pre-process your samples according to our RNA-specific documentation, then switch to the GVCF workflow at the HaplotypeCaller stage. For filtering, it will be up to you to determine whether the hard filtering or VQSR filtering method produce best results. We have not tested any of this so we cannot provide a recommendation. Be prepared to do a lot of analysis to validate the quality of your results.

+

Good luck!

\ No newline at end of file diff --git a/doc_archive/faqs/Can_I_use_GATK_on_non-diploid_organisms?.md b/doc_archive/faqs/Can_I_use_GATK_on_non-diploid_organisms?.md new file mode 100644 index 000000000..948e95d78 --- /dev/null +++ b/doc_archive/faqs/Can_I_use_GATK_on_non-diploid_organisms?.md @@ -0,0 +1,19 @@ +## Can I use GATK on non-diploid organisms? + +http://gatkforums.broadinstitute.org/gatk/discussion/1214/can-i-use-gatk-on-non-diploid-organisms + + +

In general most GATK tools don't care about ploidy. The major exception is, of course, at the variant calling step: the variant callers need to know what ploidy is assumed for a given sample in order to perform the appropriate calculations.

+

Ploidy-related functionalities

+

As of version 3.3, the HaplotypeCaller and GenotypeGVCFs are able to deal with non-diploid organisms (whether haploid or exotically polyploid). In the case of HaplotypeCaller, you need to specify the ploidy of your non-diploid sample with the -ploidy argument. HC can only deal with one ploidy at a time, so if you want to process different chromosomes with different ploidies (e.g. to call X and Y in males) you need to run them separately. On the bright side, you can combine the resulting files afterward. In particular, if you’re running the -ERC GVCF workflow, you’ll find that both CombineGVCFs and GenotypeGVCFs are able to handle mixed ploidies (between locations and between samples). Both tools are able to correctly work out the ploidy of any given sample at a given site based on the composition of the GT field, so they don’t require you to specify the -ploidy argument.

+

For earlier versions (all the way to 2.0) the fallback option is UnifiedGenotyper, which also accepts the -ploidy argument.

+

Cases where ploidy needs to be specified

+
    +
  1. Native variant calling in haploid or polyploid organisms.
  2. Pooled calling where many pooled organisms share a single barcode and hence are treated as a single "sample".
  3. Pooled validation/genotyping at known sites.
+

For normal organism ploidy, you just set the -ploidy argument to the desired number of chromosomes per organism. In the case of pooled sequencing experiments, this argument should be set to the number of chromosomes per barcoded sample, i.e. (Ploidy per individual) * (Individuals in pool).
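
For example (file names are placeholders), calling variants on a pool of 10 diploid individuals sharing a single barcode would use 2 x 10 = 20:

    java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R reference.fasta -I pool.bam -ploidy 20 -o pool_variants.vcf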

+

Important limitations

+

Several variant annotations are not appropriate for use with non-diploid cases. In particular, InbreedingCoeff will not be annotated on non-diploid calls. Annotations that do work and are supported in non-diploid use cases are the following: QUAL, QD, SB, FS, AC, AF, and Genotype annotations such as PL, AD, GT, etc.

+

You should also be aware of the fundamental accuracy limitations of high ploidy calling. Calling low-frequency variants in a pool or in an organism with high ploidy is hard because these rare variants become almost indistinguishable from sequencing errors.

\ No newline at end of file diff --git a/doc_archive/faqs/Can_I_use_different_versions_of_the_GATK_at_different_steps_of_my_analysis?.md b/doc_archive/faqs/Can_I_use_different_versions_of_the_GATK_at_different_steps_of_my_analysis?.md new file mode 100644 index 000000000..092b8bd62 --- /dev/null +++ b/doc_archive/faqs/Can_I_use_different_versions_of_the_GATK_at_different_steps_of_my_analysis?.md @@ -0,0 +1,18 @@ +## Can I use different versions of the GATK at different steps of my analysis? + +http://gatkforums.broadinstitute.org/gatk/discussion/3536/can-i-use-different-versions-of-the-gatk-at-different-steps-of-my-analysis + +

Short answer: NO.

+

Medium answer: no, at least not if you want to run a low-risk pipeline.

+

Long answer: see below for details.

+
+

The rationale

+

There are several reasons why you might want to do this: you're using the latest version of GATK and one of the tools has a show-stopping bug, so you'd like to use an older, pre-bug version of that tool, but still use the latest version of all the other tools; or maybe you've been using an older version of GATK and you'd like to use a new tool, but keep using the rest in the version that you've been using to process hundreds of samples already.

+

The problem: compatibility is not guaranteed

+

In many cases, when we modify one tool in the GATK, we need to make adjustments to other tools that interact either directly or indirectly with the data consumed or produced by the upgraded tool. If you mix and match tools from different versions of GATK, you risk running into compatibility issues. For example, HaplotypeCaller expects a BAM compressed by Reduce Reads to have its data annotated in a certain way. If the information is formatted differently than what the HC expects (because that's how the corresponding RR from the same version does it), it can blow up -- or worse, do the wrong thing but not tell you there's a problem.

+

But what if the tools/tasks are in unrelated workflows?

+

Would it really be so bad to use CountReads from GATK version 2.7 for a quick QC check that's not actually part of my pipeline, which uses version 2.5? Well, maaaaybe not, but we still think it's a source of error, and we do our damnedest to eliminate those.

+

The conclusion

+

You shouldn't use tools from different versions within the same workflow, that's for sure. We don't think it's worth the risks. If there's a show-stopping bug, let us know and we promise to fix it as soon as (humanly) possible. For the rest, either accept that you're stuck with the version you started your study with (we may be able to help with workarounds for known issues), or upgrade your entire workflow and start your analysis from scratch. Depending on how far along you are one of those options will be less painful to you; go with that.

+

The plea bargain, and a warning

+

If despite our dire warnings you're still going to mix and match tool versions, fine, we can't stop you. But be really careful, and check every version release notes document ever. And keep in mind that when things go wrong, we will deny you support if we think you've been reckless.

\ No newline at end of file diff --git a/doc_archive/faqs/Collected_FAQs_about_VCF_files.md b/doc_archive/faqs/Collected_FAQs_about_VCF_files.md new file mode 100644 index 000000000..5d5587892 --- /dev/null +++ b/doc_archive/faqs/Collected_FAQs_about_VCF_files.md @@ -0,0 +1,10 @@ +## Collected FAQs about VCF files + +http://gatkforums.broadinstitute.org/gatk/discussion/1318/collected-faqs-about-vcf-files + +

1. What file formats do you support for variant callsets?

+

We support the Variant Call Format (VCF) for variant callsets. No other file formats are supported.

+

2. How can I know if my VCF file is valid?

+

VCFTools contains a validation tool that will allow you to verify it.
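
For example, the validator bundled with VCFtools can be run like this (the file name is a placeholder):

    vcf-validator my_callset.vcf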

+

3. Are you planning to include any converters from different formats or allow different input formats than VCF?

+

No, we like VCF and we think it's important to have a good standard format. Multiplying formats just makes life hard for everyone, both developers and analysts.

\ No newline at end of file diff --git a/doc_archive/faqs/Collected_FAQs_about_input_files_for_sequence_read_data_(BAM_CRAM).md b/doc_archive/faqs/Collected_FAQs_about_input_files_for_sequence_read_data_(BAM_CRAM).md new file mode 100644 index 000000000..c8463bc3a --- /dev/null +++ b/doc_archive/faqs/Collected_FAQs_about_input_files_for_sequence_read_data_(BAM_CRAM).md @@ -0,0 +1,90 @@ +## Collected FAQs about input files for sequence read data (BAM/CRAM) + +http://gatkforums.broadinstitute.org/gatk/discussion/1317/collected-faqs-about-input-files-for-sequence-read-data-bam-cram + +

1. What file formats do you support for sequence data input?

+

The GATK supports the BAM format for reads, quality scores, alignments, and metadata (e.g. the lane of sequencing, center of origin, sample name, etc.). Starting with version 3.5, the CRAM format is supported as well. SAM format is not supported but can be easily converted with Picard tools.

+
+

2. How do I get my data into BAM format?

+

The GATK doesn't have any tools for getting data into BAM format, but many other toolkits exist for this purpose. We recommend you look at Picard and Samtools for creating and manipulating BAM files. Also, many aligners are starting to emit BAM files directly. See BWA for one such aligner.

+
+

3. What are the formatting requirements for my BAM file(s)?

+

All BAM/CRAM files must satisfy the following requirements:

+ +

See the official BAM specification for more information on what constitutes a valid BAM file.

+
+

4. What is the canonical ordering of human reference contigs in a BAM file?

+

It depends on whether you're using the NCBI/GRC build 36/build 37 version of the human genome, or the UCSC hg18/hg19 version of the human genome. While substantially equivalent, the naming conventions are different. The canonical ordering of contigs for these genomes is as follows:

+

Human genome reference consortium standard ordering and names (b3x): 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT...

+

UCSC convention (hg1x): chrM, chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22, chrX, chrY...

+
+

5. How can I tell if my BAM file is sorted properly?

+

The easiest way to do it is to download Samtools and run the following command to examine the header of your file:

+
$ samtools view -H /path/to/my.bam
+@HD     VN:1.0  GO:none SO:coordinate
+@SQ     SN:1    LN:247249719
+@SQ     SN:2    LN:242951149
+@SQ     SN:3    LN:199501827
+@SQ     SN:4    LN:191273063
+@SQ     SN:5    LN:180857866
+@SQ     SN:6    LN:170899992
+@SQ     SN:7    LN:158821424
+@SQ     SN:8    LN:146274826
+@SQ     SN:9    LN:140273252
+@SQ     SN:10   LN:135374737
+@SQ     SN:11   LN:134452384
+@SQ     SN:12   LN:132349534
+@SQ     SN:13   LN:114142980
+@SQ     SN:14   LN:106368585
+@SQ     SN:15   LN:100338915
+@SQ     SN:16   LN:88827254
+@SQ     SN:17   LN:78774742
+@SQ     SN:18   LN:76117153
+@SQ     SN:19   LN:63811651
+@SQ     SN:20   LN:62435964
+@SQ     SN:21   LN:46944323
+@SQ     SN:22   LN:49691432
+@SQ     SN:X    LN:154913754
+@SQ     SN:Y    LN:57772954
+@SQ     SN:MT   LN:16571
+@SQ     SN:NT_113887    LN:3994
+...
+

If the order of the contigs here matches the contig ordering specified above, and the SO:coordinate flag appears in your header, then your contig and read ordering satisfies the GATK requirements.

+
+

6. My BAM file isn't sorted that way. How can I fix it?

+

Picard offers a tool called SortSam that will sort a BAM file properly. A similar utility exists in Samtools, but we recommend the Picard tool because SortSam will also set a flag in the header that specifies that the file is correctly sorted, and this flag is necessary for the GATK to know it is safe to process the data. Also, you can use the ReorderSam command to make a BAM file's SQ order match another reference sequence.
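
For example, a minimal SortSam command might look like this (file names are placeholders):

    java -jar picard.jar SortSam I=unsorted.bam O=sorted.bam SORT_ORDER=coordinate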

+
+

7. How can I tell if my BAM file has read group and sample information?

+

A quick Unix command using Samtools will do the trick:

+
$ samtools view -H /path/to/my.bam | grep '^@RG'
+@RG ID:0    PL:solid    PU:Solid0044_20080829_1_Pilot1_Ceph_12414_B_lib_1_2Kb_MP_Pilot1_Ceph_12414_B_lib_1_2Kb_MP   LB:Lib1 PI:2750 DT:2008-08-28T20:00:00-0400 SM:NA12414  CN:bcm
+@RG ID:1    PL:solid    PU:0083_BCM_20080719_1_Pilot1_Ceph_12414_B_lib_1_2Kb_MP_Pilot1_Ceph_12414_B_lib_1_2Kb_MP    LB:Lib1 PI:2750 DT:2008-07-18T20:00:00-0400 SM:NA12414  CN:bcm
+@RG ID:2    PL:LS454    PU:R_2008_10_02_06_06_12_FLX01080312_retry  LB:HL#01_NA11881    PI:0    SM:NA11881  CN:454MSC
+@RG ID:3    PL:LS454    PU:R_2008_10_02_06_07_08_rig19_retry    LB:HL#01_NA11881    PI:0    SM:NA11881  CN:454MSC
+@RG ID:4    PL:LS454    PU:R_2008_10_02_17_50_32_FLX03080339_retry  LB:HL#01_NA11881    PI:0    SM:NA11881  CN:454MSC
+...
+

The presence of the @RG tags indicate the presence of read groups. Each read group has a SM tag, indicating the sample from which the reads belonging to that read group originate.

+

In addition to the presence of a read group in the header, each read must belong to one and only one read group. Given the following example reads,

+
$ samtools view /path/to/my.bam | grep 'RG:Z:'
+EAS139_44:2:61:681:18781    35  1   1   0   51M =   9   59  TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA B<>;==?=?<==?=?=>>?>><=<?=?8<=?>?<:=?>?<==?=>:;<?:= RG:Z:4  MF:i:18 Aq:i:0  NM:i:0  UQ:i:0  H0:i:85 H1:i:31
+EAS139_44:7:84:1300:7601    35  1   1   0   51M =   12  62  TAACCCTAAGCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA G<>;==?=?&=>?=?<==?>?<>>?=?<==?>?<==?>?1==@>?;<=><; RG:Z:3  MF:i:18 Aq:i:0  NM:i:1  UQ:i:5  H0:i:0  H1:i:85
+EAS139_44:8:59:118:13881    35  1   1   0   51M =   2   52  TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA @<>;<=?=?==>?>?<==?=><=>?-?;=>?:><==?7?;<>?5?<<=>:; RG:Z:1  MF:i:18 Aq:i:0  NM:i:0  UQ:i:0  H0:i:85 H1:i:31
+EAS139_46:3:75:1326:2391    35  1   1   0   51M =   12  62  TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA @<>==>?>@???B>A>?>A?A>??A?@>?@A?@;??A>@7>?>>@:>=@;@ RG:Z:0  MF:i:18 Aq:i:0  NM:i:0  UQ:i:0  H0:i:85 H1:i:31
+...
+

membership in a read group is specified by the RG:Z:* tag. For instance, the first read belongs to read group 4 (sample NA11881), while the last read shown here belongs to read group 0 (sample NA12414).

+
+

8. My BAM file doesn't have read group and sample information. Do I really need it?

+

Yes! Many algorithms in the GATK need to know that certain reads were sequenced together on a specific lane, as they attempt to compensate for variability from one sequencing run to the next. Others need to know that the data represents not just one, but many samples. Without the read group and sample information, the GATK has no way of determining this critical information. You can use Picard's AddOrReplaceReadGroups tool to add read group information.

+
+

11. What's the best way to create a subset of my BAM file containing only reads over a small interval?

+

You can use the GATK to do the following:

+
java -jar GenomeAnalysisTK.jar -R reference.fasta -I full_input.bam -T PrintReads -L chr1:10-20 -o subset_input.bam
+

and you'll get a BAM file containing only reads overlapping those points. This operation retains the complete BAM header from the full file (this was the reference aligned to, after all) so that the BAM remains easy to work with. We routinely use these features for testing and high-performance analysis with the GATK.

\ No newline at end of file diff --git a/doc_archive/faqs/Collected_FAQs_about_interval_lists.md b/doc_archive/faqs/Collected_FAQs_about_interval_lists.md new file mode 100644 index 000000000..cfe8bfe40 --- /dev/null +++ b/doc_archive/faqs/Collected_FAQs_about_interval_lists.md @@ -0,0 +1,40 @@ +## Collected FAQs about interval lists + +http://gatkforums.broadinstitute.org/gatk/discussion/1319/collected-faqs-about-interval-lists + +

1. Can GATK tools be restricted to specific intervals instead of processing the entire reference?

+

Absolutely. Just use the -L argument to provide the list of intervals you wish to run on. Or you can use -XL to exclude intervals, e.g. to blacklist genome regions that are problematic.
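
For example (file names are placeholders), restricting a run to a set of target intervals while excluding a blacklist might look like this:

    java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R reference.fasta -I input.bam \
        -L targets.interval_list -XL blacklist.bed -o output.vcf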

+
+

2. What file formats does GATK support for interval lists?

+

GATK supports several types of interval list formats: Picard-style .interval_list, GATK-style .list, BED files with extension .bed, and VCF files.

+

A. Picard-style .interval_list

+

Picard-style interval files have a SAM-like header that includes a sequence dictionary. The intervals are given in the form <chr> <start> <stop> + <target_name>, with fields separated by tabs, and the coordinates are 1-based (first position in the genome is position 1, not position 0).

+
@HD     VN:1.0  SO:coordinate
+@SQ     SN:1    LN:249250621    AS:GRCh37       UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta   M5:1b22b98cdeb4a9304cb5d48026a85128     SP:Homo Sapiens
+@SQ     SN:2    LN:243199373    AS:GRCh37       UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta   M5:a0d9851da00400dec1098a9255ac712e     SP:Homo Sapiens
+1       30366   30503   +       target_1
+1       69089   70010   +       target_2
+1       367657  368599  +       target_3
+1       621094  622036  +       target_4
+1       861320  861395  +       target_5
+1       865533  865718  +       target_6
+

This is the preferred format because the explicit sequence dictionary safeguards against accidental misuse (e.g. apply hg18 intervals to an hg19 BAM file). Note that this file is 1-based, not 0-based (the first position in the genome is position 1).

+

B. GATK-style .list or .intervals

+

This is a simpler format, where intervals are in the form <chr>:<start>-<stop>, and no sequence dictionary is necessary. This file format also uses 1-based coordinates. Note that only the <chr> part is strictly required; if you just want to specify chromosomes/ contigs as opposed to specific coordinate ranges, you don't need to specify the rest. Both <chr>:<start>-<stop> and <chr> can be present in the same file. You can also specify intervals in this format directly at the command line instead of writing them in a file.
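
For example, a small GATK-style intervals file might contain (contig names and coordinates are purely illustrative, using b37-style naming):

    20:1000001-2000000
    20:2500000-2600000
    X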

+

C. BED files with extension .bed

+

We also accept the widely-used BED format, where intervals are in the form <chr> <start> <stop>, with fields separated by tabs. However, you should be aware that this file format is 0-based for the start coordinates, so coordinates taken from 1-based formats (e.g. if you're cooking up a custom interval list derived from a file in a 1-based format) should be offset by 1. The GATK engine recognizes the .bed extension and interprets the coordinate system accordingly.

+

D. VCF files

+

Yeah, I bet you didn't expect that was a thing! It's very convenient. Say you want to redo a variant calling run on a set of variant calls that you were given by a colleague, but with the latest version of HaplotypeCaller. You just provide the VCF, slap on some padding on the fly using e.g. -ip 100 in the HC command, and boom, done. Each record in the VCF will be interpreted as a single-base interval, and by adding padding you ensure that the caller sees enough context to reevaluate the call appropriately.

+
+

3. Is there a required order of intervals?

+

Yes, thanks for asking. The intervals MUST be sorted by coordinate (in increasing order) within contigs; and the contigs must be sorted in the same order as in the sequence dictionary. This is for efficiency reasons.

+
+

4. Can I provide multiple sets of intervals?

+

Sure, no problem -- just pass them in using separate -L arguments. You can use all the different formats within the same command line. By default, the GATK engine will take the UNION of all the intervals in all the sets. This behavior can be modified by setting an interval_set rule.

+
+

5. How will GATK handle intervals that abut or overlap?

+

Very gracefully. By default the GATK engine will merge any intervals that abut (i.e. they are contiguous, they touch without overlapping) or overlap into a single interval. This behavior can be modified by setting an interval_merging rule.

+
+

6. What's the best way to pad intervals?

+

You can use the -ip engine argument to add padding on the fly. No need to produce separate padded targets files. Sweet, right?

+

Note that if intervals that previously didn't abut or overlap before you added padding now do so, by default the GATK engine will merge them as described above. This behavior can be modified by setting an interval_merging rule.

\ No newline at end of file diff --git a/doc_archive/faqs/How_can_I_access_the_GSA_public_FTP_server?.md b/doc_archive/faqs/How_can_I_access_the_GSA_public_FTP_server?.md new file mode 100644 index 000000000..42e51c6f8 --- /dev/null +++ b/doc_archive/faqs/How_can_I_access_the_GSA_public_FTP_server?.md @@ -0,0 +1,18 @@ +## How can I access the GSA public FTP server? + +http://gatkforums.broadinstitute.org/gatk/discussion/1215/how-can-i-access-the-gsa-public-ftp-server + +

NOTE: This article will be deprecated in the near future as this information will be consolidated elsewhere.

+

We make various files available for public download from the GSA FTP server, such as the GATK resource bundle and presentation slides. We also maintain a public upload feature for processing bug reports from users.

+

There are two logins to choose from depending on whether you want to upload or download something:

+

Downloading

+
location: ftp.broadinstitute.org
+username: gsapubftp-anonymous
+password: <blank>
+

Uploading

+
location: ftp.broadinstitute.org
+username: gsapubftp
+password: 5WvQWSfi
+

Using a browser as FTP client

+

If you use your browser as FTP client, make sure to include the login information in the address, otherwise you will access the general Broad Institute FTP instead of our team FTP. This should work as a direct link (for downloading only):

+

ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle

\ No newline at end of file diff --git a/doc_archive/faqs/How_can_I_invoke_read_filters_and_their_arguments?.md b/doc_archive/faqs/How_can_I_invoke_read_filters_and_their_arguments?.md new file mode 100644 index 000000000..f2fe2215d --- /dev/null +++ b/doc_archive/faqs/How_can_I_invoke_read_filters_and_their_arguments?.md @@ -0,0 +1,14 @@ +## How can I invoke read filters and their arguments? + +http://gatkforums.broadinstitute.org/gatk/discussion/2338/how-can-i-invoke-read-filters-and-their-arguments + +

Most GATK tools apply several read filters by default. You can look up exactly what the defaults are for each tool in their respective Technical Documentation pages.

+

But sometimes you want to specify additional filters yourself (and before you ask, no, you cannot disable the default read filters used by a given tool). This is how you do it:

+

The --read_filter argument (or -rf for short) allows you to apply whatever read filters you'd like. For example, to add the MaxReadLengthFilter filter to PrintReads, you just add this to your command line:

+
--read_filter MaxReadLength 
+

Notice that when you specify a read filter, you need to strip the Filter part of its name off!

+

The read filter will be applied with its default value (which you can also look up in the Tech Docs for that filter). Now, if you want to specify a different value from the default, you pass the relevant argument by adding this right after the read filter:

+
--read_filter MaxReadLength -maxReadLength 76
+

It's important that you pass the argument right after the filter itself, otherwise the command line parser won't know that they're supposed to go together.

+

And of course, you can add as many filters as you like by using multiple copies of the --read_filter parameter:

+
--read_filter MaxReadLength --maxReadLength 76 --read_filter ZeroMappingQualityRead
\ No newline at end of file diff --git a/doc_archive/faqs/How_can_I_prepare_a_FASTA_file_to_use_as_reference?.md b/doc_archive/faqs/How_can_I_prepare_a_FASTA_file_to_use_as_reference?.md new file mode 100644 index 000000000..ac1db50bf --- /dev/null +++ b/doc_archive/faqs/How_can_I_prepare_a_FASTA_file_to_use_as_reference?.md @@ -0,0 +1,114 @@ +## How can I prepare a FASTA file to use as reference? + +http://gatkforums.broadinstitute.org/gatk/discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference + +

This article describes the steps necessary to prepare your reference file (if it's not one that you got from us). As a complement to this article, see the relevant tutorial.

+

Why these steps are necessary

+

The GATK uses two files to access and safety-check the reference file: a .dict dictionary of the contig names and sizes, and a .fai fasta index file that allows efficient random access to the reference bases. You have to generate these files in order to be able to use a FASTA file as reference.

+

NOTE: Picard and samtools treat spaces in contig names differently. We recommend that you avoid using spaces in contig names.

+

Creating the fasta sequence dictionary file

+

We use CreateSequenceDictionary.jar from Picard to create a .dict file from a fasta file.

+
> java -jar CreateSequenceDictionary.jar R= Homo_sapiens_assembly18.fasta O= Homo_sapiens_assembly18.dict
+[Fri Jun 19 14:09:11 EDT 2009] net.sf.picard.sam.CreateSequenceDictionary R= Homo_sapiens_assembly18.fasta O= Homo_sapiens_assembly18.dict
+[Fri Jun 19 14:09:58 EDT 2009] net.sf.picard.sam.CreateSequenceDictionary done.
+Runtime.totalMemory()=2112487424
+44.922u 2.308s 0:47.09 100.2%   0+0k 0+0io 2pf+0w
+

This produces a SAM-style header file describing the contents of our fasta file.

+
> cat Homo_sapiens_assembly18.dict 
+@HD     VN:1.0  SO:unsorted
+@SQ     SN:chrM LN:16571        UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:d2ed829b8a1628d16cbeee88e88e39eb
+@SQ     SN:chr1 LN:247249719    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:9ebc6df9496613f373e73396d5b3b6b6
+@SQ     SN:chr2 LN:242951149    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:b12c7373e3882120332983be99aeb18d
+@SQ     SN:chr3 LN:199501827    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:0e48ed7f305877f66e6fd4addbae2b9a
+@SQ     SN:chr4 LN:191273063    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:cf37020337904229dca8401907b626c2
+@SQ     SN:chr5 LN:180857866    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:031c851664e31b2c17337fd6f9004858
+@SQ     SN:chr6 LN:170899992    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:bfe8005c536131276d448ead33f1b583
+@SQ     SN:chr7 LN:158821424    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:74239c5ceee3b28f0038123d958114cb
+@SQ     SN:chr8 LN:146274826    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:1eb00fe1ce26ce6701d2cd75c35b5ccb
+@SQ     SN:chr9 LN:140273252    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:ea244473e525dde0393d353ef94f974b
+@SQ     SN:chr10        LN:135374737    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:4ca41bf2d7d33578d2cd7ee9411e1533
+@SQ     SN:chr11        LN:134452384    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:425ba5eb6c95b60bafbf2874493a56c3
+@SQ     SN:chr12        LN:132349534    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:d17d70060c56b4578fa570117bf19716
+@SQ     SN:chr13        LN:114142980    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:c4f3084a20380a373bbbdb9ae30da587
+@SQ     SN:chr14        LN:106368585    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:c1ff5d44683831e9c7c1db23f93fbb45
+@SQ     SN:chr15        LN:100338915    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:5cd9622c459fe0a276b27f6ac06116d8
+@SQ     SN:chr16        LN:88827254     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:3e81884229e8dc6b7f258169ec8da246
+@SQ     SN:chr17        LN:78774742     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:2a5c95ed99c5298bb107f313c7044588
+@SQ     SN:chr18        LN:76117153     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:3d11df432bcdc1407835d5ef2ce62634
+@SQ     SN:chr19        LN:63811651     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:2f1a59077cfad51df907ac25723bff28
+@SQ     SN:chr20        LN:62435964     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:f126cdf8a6e0c7f379d618ff66beb2da
+@SQ     SN:chr21        LN:46944323     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:f1b74b7f9f4cdbaeb6832ee86cb426c6
+@SQ     SN:chr22        LN:49691432     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:2041e6a0c914b48dd537922cca63acb8
+@SQ     SN:chrX LN:154913754    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:d7e626c80ad172a4d7c95aadb94d9040
+@SQ     SN:chrY LN:57772954     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:62f69d0e82a12af74bad85e2e4a8bd91
+@SQ     SN:chr1_random  LN:1663265      UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:cc05cb1554258add2eb62e88c0746394
+@SQ     SN:chr2_random  LN:185571       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:18ceab9e4667a25c8a1f67869a4356ea
+@SQ     SN:chr3_random  LN:749256       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:9cc571e918ac18afa0b2053262cadab6
+@SQ     SN:chr4_random  LN:842648       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:9cab2949ccf26ee0f69a875412c93740
+@SQ     SN:chr5_random  LN:143687       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:05926bdbff978d4a0906862eb3f773d0
+@SQ     SN:chr6_random  LN:1875562      UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:d62eb2919ba7b9c1d382c011c5218094
+@SQ     SN:chr7_random  LN:549659       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:28ebfb89c858edbc4d71ff3f83d52231
+@SQ     SN:chr8_random  LN:943810       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:0ed5b088d843d6f6e6b181465b9e82ed
+@SQ     SN:chr9_random  LN:1146434      UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:1e3d2d2f141f0550fa28a8d0ed3fd1cf
+@SQ     SN:chr10_random LN:113275       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:50be2d2c6720dabeff497ffb53189daa
+@SQ     SN:chr11_random LN:215294       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:bfc93adc30c621d5c83eee3f0d841624
+@SQ     SN:chr13_random LN:186858       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:563531689f3dbd691331fd6c5730a88b
+@SQ     SN:chr15_random LN:784346       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:bf885e99940d2d439d83eba791804a48
+@SQ     SN:chr16_random LN:105485       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:dd06ea813a80b59d9c626b31faf6ae7f
+@SQ     SN:chr17_random LN:2617613      UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:34d5e2005dffdfaaced1d34f60ed8fc2
+@SQ     SN:chr18_random LN:4262 UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:f3814841f1939d3ca19072d9e89f3fd7
+@SQ     SN:chr19_random LN:301858       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:420ce95da035386cc8c63094288c49e2
+@SQ     SN:chr21_random LN:1679693      UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:a7252115bfe5bb5525f34d039eecd096
+@SQ     SN:chr22_random LN:257318       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:4f2d259b82f7647d3b668063cf18378b
+@SQ     SN:chrX_random  LN:1719168      UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:f4d71e0758986c15e5455bf3e14e5d6f
+

Creating the fasta index file

+

We use the faidx command in samtools to prepare the fasta index file. This file describes byte offsets in the fasta file for each contig, allowing us to compute exactly where a particular reference base at contig:pos is in the fasta file.

+
> samtools faidx Homo_sapiens_assembly18.fasta 
+108.446u 3.384s 2:44.61 67.9%   0+0k 0+0io 0pf+0w
+

This produces a text file with one record per line for each of the fasta contigs. Each record is of the form: contig, size, location, basesPerLine, bytesPerLine. The index file produced above looks like:

+
> cat Homo_sapiens_assembly18.fasta.fai 
+chrM    16571   6       50      51
+chr1    247249719       16915   50      51
+chr2    242951149       252211635       50      51
+chr3    199501827       500021813       50      51
+chr4    191273063       703513683       50      51
+chr5    180857866       898612214       50      51
+chr6    170899992       1083087244      50      51
+chr7    158821424       1257405242      50      51
+chr8    146274826       1419403101      50      51
+chr9    140273252       1568603430      50      51
+chr10   135374737       1711682155      50      51
+chr11   134452384       1849764394      50      51
+chr12   132349534       1986905833      50      51
+chr13   114142980       2121902365      50      51
+chr14   106368585       2238328212      50      51
+chr15   100338915       2346824176      50      51
+chr16   88827254        2449169877      50      51
+chr17   78774742        2539773684      50      51
+chr18   76117153        2620123928      50      51
+chr19   63811651        2697763432      50      51
+chr20   62435964        2762851324      50      51
+chr21   46944323        2826536015      50      51
+chr22   49691432        2874419232      50      51
+chrX    154913754       2925104499      50      51
+chrY    57772954        3083116535      50      51
+chr1_random     1663265 3142044962      50      51
+chr2_random     185571  3143741506      50      51
+chr3_random     749256  3143930802      50      51
+chr4_random     842648  3144695057      50      51
+chr5_random     143687  3145554571      50      51
+chr6_random     1875562 3145701145      50      51
+chr7_random     549659  3147614232      50      51
+chr8_random     943810  3148174898      50      51
+chr9_random     1146434 3149137598      50      51
+chr10_random    113275  3150306975      50      51
+chr11_random    215294  3150422530      50      51
+chr13_random    186858  3150642144      50      51
+chr15_random    784346  3150832754      50      51
+chr16_random    105485  3151632801      50      51
+chr17_random    2617613 3151740410      50      51
+chr18_random    4262    3154410390      50      51
+chr19_random    301858  3154414752      50      51
+chr21_random    1679693 3154722662      50      51
+chr22_random    257318  3156435963      50      51
+chrX_random     1719168 3156698441      50      51
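+

Once both files have been generated, keep them right next to the FASTA file; the GATK looks them up based on the FASTA file name. Note that the .dict file name replaces the .fasta extension rather than appending to it, so for this example the final set of files is:

+
 Homo_sapiens_assembly18.fasta
+Homo_sapiens_assembly18.fasta.fai
+Homo_sapiens_assembly18.dict
+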
\ No newline at end of file diff --git a/doc_archive/faqs/How_can_I_turn_on_or_customize_forum_notifications?.md b/doc_archive/faqs/How_can_I_turn_on_or_customize_forum_notifications?.md new file mode 100644 index 000000000..445b18116 --- /dev/null +++ b/doc_archive/faqs/How_can_I_turn_on_or_customize_forum_notifications?.md @@ -0,0 +1,16 @@ +## How can I turn on or customize forum notifications? + +http://gatkforums.broadinstitute.org/gatk/discussion/27/how-can-i-turn-on-or-customize-forum-notifications + +

By default, the forum does not send notification messages about new comments or discussions. If you want to turn on notifications or customize the type of notifications you want to receive (email, popup message etc), you need to do the following: +

+ +

+To specifically get new GATK announcements, scroll down to "Category Notifications" and tick off the "Announcements" category for email notification for discussions (and comments if you really want to know everything). +

\ No newline at end of file diff --git a/doc_archive/faqs/How_can_I_use_parallelism_to_make_GATK_tools_run_faster?.md b/doc_archive/faqs/How_can_I_use_parallelism_to_make_GATK_tools_run_faster?.md new file mode 100644 index 000000000..1816f1fda --- /dev/null +++ b/doc_archive/faqs/How_can_I_use_parallelism_to_make_GATK_tools_run_faster?.md @@ -0,0 +1,164 @@ +## How can I use parallelism to make GATK tools run faster? + +http://gatkforums.broadinstitute.org/gatk/discussion/1975/how-can-i-use-parallelism-to-make-gatk-tools-run-faster + +

This document provides technical details and recommendations on how the parallelism options offered by the GATK can be used to yield optimal performance results.

+

Overview

+

As explained in the primer on parallelism for the GATK, there are two main kinds of parallelism that can be applied to the GATK: multi-threading and scatter-gather (using Queue or Cromwell/WDL).

+

Multi-threading options

+

There are two options for multi-threading with the GATK, controlled by the arguments -nt and -nct, respectively, which can be combined:

- -nt / --num_threads controls the number of data threads sent to the processor
+- -nct / --num_cpu_threads_per_data_thread controls the number of CPU threads allocated to each data thread
+

For more information on how these multi-threading options work, please read the primer on parallelism for the GATK.

+

Memory considerations for multi-threading

+

Each data thread needs to be given the full amount of memory you’d normally give a single run. So if you’re running a tool that normally requires 2 Gb of memory to run, if you use -nt 4, the multithreaded run will use 8 Gb of memory. In contrast, CPU threads will share the memory allocated to their “mother” data thread, so you don’t need to worry about allocating memory based on the number of CPU threads you use.
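+

For example, if a single-threaded run of a tool needs about 2 Gb, a run with four data threads should be given about four times that amount (a minimal sketch; the tool, reference and file names are placeholders):

+
 $ java -Xmx8g -jar GenomeAnalysisTK.jar -T UnifiedGenotyper \
+    -R reference.fasta -I input.bam \
+    -nt 4 \
+    -o output.vcf
+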

+

Additional consideration when using -nct with versions 2.2 and 2.3

+

Because of the way the -nct option was originally implemented, in versions 2.2 and 2.3, there is one CPU thread that is reserved by the system to “manage” the rest. So if you use -nct, you’ll only really start seeing a speedup with -nct 3 (which yields two effective "working" threads) and above. This limitation has been resolved in the implementation that will be available in versions 2.4 and up.

+

Scatter-gather

+

For more details on scatter-gather, see the primer on parallelism for the GATK and the documentation on pipelining options.

+

Applicability of parallelism to the major GATK tools

+

Please note that not all tools support all parallelization modes. The parallelization modes that are available for each tool depend partly on the type of traversal that the tool uses to walk through the data, and partly on the nature of the analyses it performs.

| Tool | Full name | Type of traversal | NT | NCT | SG |
|------|-----------|-------------------|----|-----|----|
| RTC | RealignerTargetCreator | RodWalker | + | - | - |
| IR | IndelRealigner | ReadWalker | - | - | + |
| BR | BaseRecalibrator | LocusWalker | - | + | + |
| PR | PrintReads | ReadWalker | - | + | - |
| RR | ReduceReads | ReadWalker | - | - | + |
| HC | HaplotypeCaller | ActiveRegionWalker | - | (+) | + |
| UG | UnifiedGenotyper | LocusWalker | + | + | + |
+

Note that while HaplotypeCaller supports -nct in principle, many have reported that it is not very stable (random crashes may occur -- but if there is no crash, results will be correct). We prefer not to use this option with HC; use it at your own risk.

+

Recommended configurations

+

The table below summarizes configurations that we typically use for our own projects (one per tool, except we give three alternate possibilities for the UnifiedGenotyper). The different values allocated for each tool reflect not only the technical capabilities of these tools (which options are supported), but also our empirical observations of what provides the best tradeoffs between performance gains and commitment of resources. Please note however that this is meant only as a guide, and that we cannot give you any guarantee that these configurations are the best for your own setup. You will probably have to experiment with the settings to find the configuration that is right for you.

| Tool | RTC | IR | BR | PR | RR | HC | UG |
|------|-----|----|----|----|----|----|----|
| Available modes | NT | SG | NCT,SG | NCT | SG | NCT,SG | NT,NCT,SG |
| Cluster nodes | 1 | 4 | 4 | 1 | 4 | 4 | 4 / 4 / 4 |
| CPU threads (-nct) | 1 | 1 | 8 | 4-8 | 1 | 4 | 3 / 6 / 24 |
| Data threads (-nt) | 24 | 1 | 1 | 1 | 1 | 1 | 8 / 4 / 1 |
| Memory (Gb) | 48 | 4 | 4 | 4 | 4 | 16 | 32 / 16 / 4 |
+

Where NT is data multithreading, NCT is CPU multithreading and SG is scatter-gather using Queue or other data parallelization framework. For more details on scatter-gather, see the primer on parallelism for the GATK and the documentation on pipelining options.

\ No newline at end of file diff --git a/doc_archive/faqs/How_do_I_submit_a_detailed_bug_report?.md b/doc_archive/faqs/How_do_I_submit_a_detailed_bug_report?.md new file mode 100644 index 000000000..09683c57c --- /dev/null +++ b/doc_archive/faqs/How_do_I_submit_a_detailed_bug_report?.md @@ -0,0 +1,36 @@ +## How do I submit a detailed bug report? + +http://gatkforums.broadinstitute.org/gatk/discussion/1894/how-do-i-submit-a-detailed-bug-report + +

Note: only do this if you have been explicitly asked to do so.

+

Scenario:

+

You posted a question about a problem you had with GATK tools, we answered that we think it's a bug, and we asked you to submit a detailed bug report.

+

Here's what you need to provide:

+ +

A snippet file is a slice of the original BAM file which contains the problematic region and is sufficient to reproduce the error. We need it in order to reproduce the problem on our end, which is the first necessary step to finding and fixing the bug. We ask you to provide this as a snippet rather than the full file so that you don't have to upload (and we don't have to process) huge giga-scale files.

+

Here's how you create a snippet file:

+ +
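
As a rough sketch of the kind of command involved (the interval and file names are placeholders; use the region around the position where the error occurs, plus some padding), a snippet can be extracted with PrintReads:

+
 $ java -jar GenomeAnalysisTK.jar -T PrintReads \
+    -R reference.fasta \
+    -I original.bam \
+    -L 20:10000000-10020000 \
+    -o snippet.bam
+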

And finally, here's how you send us the files:

+ +

We will get back to you --hopefully with a bug fix!-- as soon as we can.

\ No newline at end of file diff --git a/doc_archive/faqs/How_does_the_GATK_handle_these_huge_NGS_datasets?.md b/doc_archive/faqs/How_does_the_GATK_handle_these_huge_NGS_datasets?.md new file mode 100644 index 000000000..483c6fadc --- /dev/null +++ b/doc_archive/faqs/How_does_the_GATK_handle_these_huge_NGS_datasets?.md @@ -0,0 +1,9 @@ +## How does the GATK handle these huge NGS datasets? + +http://gatkforums.broadinstitute.org/gatk/discussion/1320/how-does-the-gatk-handle-these-huge-ngs-datasets + +

Imagine a simple question like, "What's the depth of coverage at position A of the genome?"

+

First, you are given billions of reads that are aligned to the genome but not ordered in any particular way (except perhaps in the order they were emitted by the sequencer). This simple question is then very difficult to answer efficiently, because the algorithm is forced to examine every single read in succession, since any one of them might span position A. The algorithm must now take several hours in order to compute this value.

+

Instead, imagine the billions of reads are now sorted in reference order (that is to say, on each chromosome, the reads are stored on disk in the same order they appear on the chromosome). Now, answering the question above is trivial, as the algorithm can jump to the desired location, examine only the reads that span the position, and return immediately after those reads (and only those reads) are inspected. The total number of reads that need to be interrogated is only a handful, rather than several billion, and the processing time is seconds, not hours.
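+

As an illustration of what reference-ordered, indexed data makes possible, tools like samtools can answer the position-A question by jumping straight to the region of interest. This is a minimal sketch (the BAM name and coordinates are placeholders, and the sort syntax assumes samtools 1.3 or later):

+
 $ samtools sort -o sorted.bam input.bam
+$ samtools index sorted.bam
+$ samtools depth -r 20:1000000-1000000 sorted.bam
+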

+

This reference-ordered sorting enables the GATK to process terabytes of data quickly and without tremendous memory overhead. Most GATK tools run very quickly and with less than 2 gigabytes of RAM. Without this sorting, the GATK cannot operate correctly. Thus, it is a fundamental rule of working with the GATK, which is the reason for the Central Dogma of the GATK:

+

All datasets (reads, alignments, quality scores, variants, dbSNP information, gene tracks, interval lists - everything) must be sorted in the order of one of the canonical reference sequences.

\ No newline at end of file diff --git a/doc_archive/faqs/How_should_I_cite_GATK_in_my_own_publications?.md b/doc_archive/faqs/How_should_I_cite_GATK_in_my_own_publications?.md new file mode 100644 index 000000000..23fd5b24c --- /dev/null +++ b/doc_archive/faqs/How_should_I_cite_GATK_in_my_own_publications?.md @@ -0,0 +1,25 @@ +## How should I cite GATK in my own publications? + +http://gatkforums.broadinstitute.org/gatk/discussion/6201/how-should-i-cite-gatk-in-my-own-publications + +

To date we have published three papers on GATK (citation details below). The ideal way to cite the GATK is to cite all three together, as in:

+
+

We sequenced 10 samples on 10 lanes on an Illumina HiSeq 2000, aligned the resulting reads to the hg19 reference genome with BWA (Li & Durbin), applied GATK (McKenna et al., 2010) base quality score recalibration, indel realignment, duplicate removal, and performed SNP and INDEL discovery and genotyping across all 10 samples simultaneously using standard hard filtering parameters or variant quality score recalibration according to GATK Best Practices recommendations (DePristo et al., 2011; Van der Auwera et al., 2013).

+
+
+

McKenna et al. 2010 : Original description of the GATK framework

+

The first GATK paper covers the computational philosophy underlying the GATK and is a good citation for the GATK in general.

+

The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA, 2010 GENOME RESEARCH 20:1297-303

+

Article | Pubmed

+
+

DePristo et al. 2011 : First incarnation of the Best Practices workflow

+

The second GATK paper describes in more detail some of the key tools commonly used in the GATK for high-throughput sequencing data processing and variant discovery. The paper covers base quality score recalibration, indel realignment, SNP calling with UnifiedGenotyper, variant quality score recalibration and their application to deep whole genome, whole exome, and low-pass multi-sample calling. This is a good citation if you use the GATK for variant discovery.

+

A framework for variation discovery and genotyping using next-generation DNA sequencing data DePristo M, Banks E, Poplin R, Garimella K, Maguire J, Hartl C, Philippakis A, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell T, Kernytsky A, Sivachenko A, Cibulskis K, Gabriel S, Altshuler D, Daly M, 2011 NATURE GENETICS 43:491-498

+

Article | Pubmed

+

Note that the workflow described in this paper corresponds to the version 1.x to 2.x best practices. Some key steps for variant discovery have been significantly modified in later versions (3.x onwards). This paper should not be used as a definitive guide to variant discovery with GATK. For that, please see our online documentation guide.

+
+

Van der Auwera et al. 2013 : Hands-on tutorial with step-by-step explanations

+

The third GATK paper describes the Best Practices for Variant Discovery (version 2.x). It is intended mainly as a learning resource for first-time users and as a protocol reference. This is a good citation to include in a Materials and Methods section.

+

From FastQ Data to High-Confidence Variant Calls: The Genome Analysis Toolkit Best Practices Pipeline Van der Auwera GA, Carneiro M, Hartl C, Poplin R, del Angel G, Levy-Moonshine A, Jordan T, Shakir K, Roazen D, Thibault J, Banks E, Garimella K, Altshuler D, Gabriel S, DePristo M, 2013 CURRENT PROTOCOLS IN BIOINFORMATICS 43:11.10.1-11.10.33

+

Article | PubMed

+

Remember that as our work continues and our Best Practices recommendations evolve, specific command lines, argument values and even tool choices described in the paper become obsolete. Be sure to always refer to our Best Practices documentation for the most up-to-date and version-appropriate recommendations.

\ No newline at end of file diff --git a/doc_archive/faqs/How_should_I_pre-process_data_from_multiplexed_sequencing_and_multi-library_designs?.md b/doc_archive/faqs/How_should_I_pre-process_data_from_multiplexed_sequencing_and_multi-library_designs?.md new file mode 100644 index 000000000..7379e617c --- /dev/null +++ b/doc_archive/faqs/How_should_I_pre-process_data_from_multiplexed_sequencing_and_multi-library_designs?.md @@ -0,0 +1,53 @@ +## How should I pre-process data from multiplexed sequencing and multi-library designs? + +http://gatkforums.broadinstitute.org/gatk/discussion/3060/how-should-i-pre-process-data-from-multiplexed-sequencing-and-multi-library-designs + +

Our Best Practices pre-processing documentation assumes a simple experimental design in which you have one set of input sequence files (forward/reverse or interleaved FASTQ, or unmapped uBAM) per sample, and you run each step of the pre-processing workflow separately for each sample, resulting in one BAM file per sample at the end of this phase.

+

However, if you are generating multiple libraries for each sample, and/or multiplexing samples within and/or across sequencing lanes, the data must be de-multiplexed before pre-processing, typically resulting in multiple sets of FASTQ files per sample all of which should have distinct read group IDs (RGID).

+

At that point there are several different valid strategies for implementing the pre-processing workflow. Here at the Broad Institute, we run the initial steps of the pre-processing workflow (mapping, sorting and marking duplicates) separately on each individual read group. Then we merge the data to produce a single BAM file for each sample (aggregation); this is done by re-running Mark Duplicates, this time on all read group BAM files for a sample at the same time. Then we run Indel Realignment and Base Recalibration on the aggregated per-sample BAM files. See the worked-out example below and this presentation for more details.

+

Note that there are many possible ways to achieve a similar result; here we present the way we think gives the best combination of efficiency and quality. This assumes that you are dealing with one or more samples, and each of them was sequenced on one or more lanes.

+

Example

+

Let's say we have this example data (assuming interleaved FASTQs containing both forward and reverse reads) for two sample libraries, sampleA and sampleB, which were each sequenced on two lanes, lane1 and lane2:

+ +

These will each be identified as separate read groups A1, A2, B1 and B2. If we had multiple libraries per sample, we would further distinguish them (eg sampleA_lib1_lane1.fq leading to read group A11, sampleA_lib2_lane1.fq leading to read group A21 and so on).

+

1. Run initial steps per-readgroup once

+

Assuming that you received one FASTQ file per sample library, per lane of sequence data (which amounts to a read group), run each file through mapping and sorting. During the mapping step you assign read group information, which will be very important in the next steps so be sure to do it correctly. See the read groups dictionary entry for guidance.
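+

For instance, read group information can be assigned at the mapping step with bwa mem's -R option (the -p flag tells bwa the FASTQ is interleaved), and the output can then be coordinate-sorted with Picard. This is a minimal sketch; the reference, file names and read group fields are placeholders:

+
 $ bwa mem -p -R '@RG\tID:A1\tSM:sampleA\tLB:sampleA_lib1\tPU:lane1\tPL:ILLUMINA' \
+    reference.fasta sampleA_lane1.fq > sampleA_lane1.sam
+$ java -jar picard.jar SortSam I=sampleA_lane1.sam O=sampleA_lane1.bam SORT_ORDER=coordinate
+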

+

The example data becomes:

+ +

At this point we mark duplicates in each read group BAM file (dedup), which allows us to estimate the complexity of the corresponding library of origin as a quality control step. This step is optional.

+

The example data becomes:

+ +

Technically this first run of marking duplicates is not necessary because we will run it again per-sample, and that per-sample marking would be enough to achieve the desired result. To reiterate, we only do this round of marking duplicates for QC purposes.

+

2. Merge read groups and mark duplicates per sample (aggregation + dedup)

+

Once you have pre-processed each read group individually, you merge read groups belonging to the same sample into a single BAM file. You can do this as a standalone step, but for the sake of efficiency we combine this with the per-readgroup duplicate marking step (it's simply a matter of passing the multiple inputs to MarkDuplicates in a single command).
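+

For example, with Picard MarkDuplicates the per-readgroup BAMs for a sample can be merged and duplicate-marked in one pass (a minimal sketch; file names are placeholders):

+
 $ java -jar picard.jar MarkDuplicates \
+    I=sampleA_lane1.dedup.bam \
+    I=sampleA_lane2.dedup.bam \
+    O=sampleA.merged.dedup.bam \
+    M=sampleA.dedup.metrics.txt
+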

+

The example data becomes:

+ +

To be clear, this is the round of marking duplicates that matters. It eliminates PCR duplicates (arising from library preparation) across all lanes in addition to optical duplicates (which are by definition only per-lane).

+

3. Remaining per-sample pre-processing

+

Then you run indel realignment (optional) and base recalibration (BQSR).

+

The example data becomes:

+ +

Realigning around indels per-sample leads to consistent alignments across all lanes within a sample. This step is only necessary if you will be using a locus-based variant caller like MuTect 1 or UnifiedGenotyper (for legacy reasons). If you will be using HaplotypeCaller or MuTect2, you do not need to perform indel realignment.

+

Base recalibration will be applied per-read group if you assigned appropriate read group information in your data. BaseRecalibrator distinguishes read groups by RGID, or RGPU if it is available (PU takes precedence over ID). This will identify separate read groups (distinguishing both lanes and libraries) as such even if they are in the same BAM file, and it will always process them separately -- as long as the read groups are identified correctly of course. There would be no sense in trying to recalibrate across lanes, since the purpose of this processing step is to compensate for the errors made by the machine during sequencing, and the lane is the base unit of the sequencing machine (assuming the equipment is Illumina HiSeq or similar technology).
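+

As a sketch of what this looks like in GATK 3.x (the reference, known-sites resource and file names are placeholders), recalibration is modeled on the aggregated per-sample BAM and then applied with PrintReads:

+
 $ java -jar GenomeAnalysisTK.jar -T BaseRecalibrator \
+    -R reference.fasta -I sampleA.merged.dedup.bam \
+    -knownSites dbsnp.vcf \
+    -o sampleA.recal.table
+$ java -jar GenomeAnalysisTK.jar -T PrintReads \
+    -R reference.fasta -I sampleA.merged.dedup.bam \
+    -BQSR sampleA.recal.table \
+    -o sampleA.merged.dedup.recal.bam
+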

+

People often ask also if it's worth the trouble to try realigning across all samples in a cohort. The answer is almost always no, unless you have very shallow coverage. The problem is that while it would be lovely to ensure consistent alignments around indels across all samples, the computational cost gets too ridiculous too fast. That being said, for contrastive calling projects -- such as cancer tumor/normals -- we do recommend realigning both the tumor and the normal together in general to avoid slight alignment differences between the two tissue types.

\ No newline at end of file diff --git a/doc_archive/faqs/How_should_I_select_samples_for_a_Panel_of_Normals_for_somatic_analysis?.md b/doc_archive/faqs/How_should_I_select_samples_for_a_Panel_of_Normals_for_somatic_analysis?.md new file mode 100644 index 000000000..bc462d4e9 --- /dev/null +++ b/doc_archive/faqs/How_should_I_select_samples_for_a_Panel_of_Normals_for_somatic_analysis?.md @@ -0,0 +1,11 @@ +## How should I select samples for a Panel of Normals for somatic analysis? + +http://gatkforums.broadinstitute.org/gatk/discussion/7366/how-should-i-select-samples-for-a-panel-of-normals-for-somatic-analysis + +

The Panel of Normals (PoN) plays two important roles in somatic variant analysis:

+
1. Exclude germline variant sites that are found in the normals to avoid calling them as potential somatic variants in the tumor;
2. Exclude technical artifacts that arise from particular techniques (eg sample preservation) and technologies (eg library capture, sequencing chemistry).
+

Given these roles, the most important selection criteria are the technical properties of how the normal data was generated. It's very important to use normals that are as technically similar as possible to the tumor. Also, the samples should come from subjects that were young and healthy (to minimize the chance of using as normal a sample from someone who has an undiagnosed tumor).

+

If possible it is better to use normals generated from the same type of tissue because if the tissues were preserved differently, the artifact patterns may be different.

\ No newline at end of file diff --git a/doc_archive/faqs/I'm_new_to_GATK._Where_do_I_start?.md b/doc_archive/faqs/I'm_new_to_GATK._Where_do_I_start?.md new file mode 100644 index 000000000..8a9e826ae --- /dev/null +++ b/doc_archive/faqs/I'm_new_to_GATK._Where_do_I_start?.md @@ -0,0 +1,45 @@ +## I'm new to GATK. Where do I start? + +http://gatkforums.broadinstitute.org/gatk/discussion/4863/im-new-to-gatk-where-do-i-start + +

If this is your first rodeo, you're probably asking yourself:

+ \ No newline at end of file diff --git a/doc_archive/faqs/Lane,_Library,_Sample_and_Cohort_--_what_do_they_mean_and_why_are_they_important?.md b/doc_archive/faqs/Lane,_Library,_Sample_and_Cohort_--_what_do_they_mean_and_why_are_they_important?.md new file mode 100644 index 000000000..f73c45b42 --- /dev/null +++ b/doc_archive/faqs/Lane,_Library,_Sample_and_Cohort_--_what_do_they_mean_and_why_are_they_important?.md @@ -0,0 +1,18 @@ +## Lane, Library, Sample and Cohort -- what do they mean and why are they important? + +http://gatkforums.broadinstitute.org/gatk/discussion/3059/lane-library-sample-and-cohort-what-do-they-mean-and-why-are-they-important + +

There are four major organizational units for next-generation DNA sequencing processes that are used throughout the GATK documentation:

+ +

Note that many GATK commands can be run at the lane level, but will give better results when they see all of the data for a single sample, or even all of the data for all samples. Unfortunately, there's a trade-off in computational cost, since running these commands across all of your data simultaneously requires much more computing power. Please see the documentation for each step to understand the best way to group or partition your data for that particular process.

\ No newline at end of file diff --git a/doc_archive/faqs/Should_I_analyze_my_samples_alone_or_together?.md b/doc_archive/faqs/Should_I_analyze_my_samples_alone_or_together?.md new file mode 100644 index 000000000..3220168aa --- /dev/null +++ b/doc_archive/faqs/Should_I_analyze_my_samples_alone_or_together?.md @@ -0,0 +1,31 @@ +## Should I analyze my samples alone or together? + +http://gatkforums.broadinstitute.org/gatk/discussion/4150/should-i-analyze-my-samples-alone-or-together + +

Together is (almost always) better than alone

+

We recommend performing variant discovery in a way that enables joint analysis of multiple samples, as laid out in our Best Practices workflow. That workflow includes a joint analysis step that empowers variant discovery by providing the ability to leverage population-wide information from a cohort of multiple samples, allowing us to detect variants with great sensitivity and genotype samples as accurately as possible. Our workflow recommendations provide a way to do this that is scalable and allows incremental processing of the sequencing data.

+

The key point is that you don’t actually have to call variants on all your samples together to perform a joint analysis. We have developed a workflow that allows us to decouple the initial identification of potential variant sites (ie variant calling) from the genotyping step, which is the only part that really needs to be done jointly. Since GATK 3.0, you can use the HaplotypeCaller to call variants individually per-sample in -ERC GVCF mode, followed by a joint genotyping step on all samples in the cohort, as described in this method article. This achieves what we call incremental joint discovery, providing you with all the benefits of classic joint calling (as described below) without the drawbacks.
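+

In practice (a minimal sketch for GATK 3.x; the reference and file names are placeholders), that means running HaplotypeCaller once per sample and then genotyping the resulting GVCFs together:

+
 $ java -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
+    -R reference.fasta -I sample1.bam \
+    -ERC GVCF \
+    -o sample1.g.vcf
+$ java -jar GenomeAnalysisTK.jar -T GenotypeGVCFs \
+    -R reference.fasta \
+    -V sample1.g.vcf -V sample2.g.vcf -V sample3.g.vcf \
+    -o cohort.vcf
+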

+

Why "almost always"? Because some people have reported missing a small fraction of singletons (variants that are unique to individual samples) when using the new method. For most studies, this is an acceptable tradeoff (which is reduced by the availability of high quality sequencing data), but if you are very specifically looking for singletons, you may need to do some careful evaluation before committing to this method.

+
+

Previously established cohort analysis strategies

+

Until recently, three strategies were available for variant discovery in multiple samples:

+

- single sample calling: sample BAMs are analyzed individually, and individual call sets are combined in a downstream processing step;
+- batch calling: sample BAMs are analyzed in separate batches, and batch call sets are merged in a downstream processing step;
+- joint calling: variants are called simultaneously across all sample BAMs, generating a single call set for the entire cohort.

+

The best of these, from the point of view of variant discovery, was joint calling, because it provided the following benefits:

+

1. Clearer distinction between homozygous reference sites and sites with missing data

+

Batch-calling does not output a genotype call at sites where no member in the batch has evidence for a variant; it is thus impossible to distinguish such sites from locations missing data. In contrast, joint calling emits genotype calls at every site where any individual in the call set has evidence for variation.

+

2. Greater sensitivity for low-frequency variants

+

By sharing information across all samples, joint calling makes it possible to “rescue” genotype calls at sites where a carrier has low coverage but other samples within the call set have a confident variant at that location. However this does not apply to singletons, which are unique to a single sample. To minimize the chance of missing singletons, we increase the cohort size -- so that singletons themselves have less chance of happening in the first place.

+

3. Greater ability to filter out false positives

+

The current approaches to variant filtering (such as VQSR) use statistical models that work better with large amounts of data. Of the three calling strategies above, only joint calling provides enough data for accurate error modeling and ensures that filtering is applied uniformly across all samples.

+

+

Figure 1: Power of joint calling in finding mutations at low coverage sites. The variant allele is present in only two of the N samples, in both cases with such low coverage that the variant is not callable when processed separately. Joint calling allows evidence to be accumulated over all samples and renders the variant callable. (right) Importance of joint calling to square off the genotype matrix, using an example of two disease-relevant variants. Neither sample will have records in a variants-only output file, for different reasons: the first sample is homozygous reference while the second sample has no data. However, merging the results from single sample calling will incorrectly treat both of these samples identically as being non-informative.

+
+

Drawbacks of traditional joint calling (all steps performed multi-sample)

+

There are two major problems with the joint calling strategy.

+

- Scaling & infrastructure
+Joint calling scales very badly -- the calculations involved in variant calling (especially by methods like the HaplotypeCaller’s) become exponentially more computationally costly as you add samples to the cohort. If you don't have a lot of compute available, you run into limitations pretty quickly. Even here at Broad where we have fairly ridiculous amounts of compute available, we can't brute-force our way through the numbers for the larger cohort sizes that we're called on to handle.

+

- The N+1 problem
+When you’re getting a large-ish number of samples sequenced (especially clinical samples), you typically get them in small batches over an extended period of time, and you analyze each batch as it comes in (whether it’s because the analysis is time-sensitive or your PI is breathing down your back). But that’s not joint calling, that’s batch calling, and it doesn’t give you the same significant gains that joint calling can give you. Unfortunately the joint calling approach doesn’t allow for incremental analysis -- every time you get even one new sample sequence, you have to re-call all samples from scratch.

+

Both of these problems are solved by the single-sample calling + joint genotyping workflow.

\ No newline at end of file diff --git a/doc_archive/faqs/Should_I_use_UnifiedGenotyper_or_HaplotypeCaller_to_call_variants_on_my_data?.md b/doc_archive/faqs/Should_I_use_UnifiedGenotyper_or_HaplotypeCaller_to_call_variants_on_my_data?.md new file mode 100644 index 000000000..6964713c2 --- /dev/null +++ b/doc_archive/faqs/Should_I_use_UnifiedGenotyper_or_HaplotypeCaller_to_call_variants_on_my_data?.md @@ -0,0 +1,14 @@ +## Should I use UnifiedGenotyper or HaplotypeCaller to call variants on my data? + +http://gatkforums.broadinstitute.org/gatk/discussion/3151/should-i-use-unifiedgenotyper-or-haplotypecaller-to-call-variants-on-my-data + +

Use HaplotypeCaller!

+

The HaplotypeCaller is a more recent and sophisticated tool than the UnifiedGenotyper. Its ability to call SNPs is equivalent to that of the UnifiedGenotyper, its ability to call indels is far superior, and it is now capable of calling non-diploid samples. It also comprises several unique functionalities such as the reference confidence model (which enables efficient and incremental variant discovery on ridiculously large cohorts) and special settings for RNAseq data.

+

As of GATK version 3.3, we recommend using HaplotypeCaller in all cases, with no exceptions.

+

Caveats for older versions

+

If you are limited to older versions for project continuity, you may opt to use UnifiedGenotyper in the following cases:

+ \ No newline at end of file diff --git a/doc_archive/faqs/What's_in_the_resource_bundle_and_how_can_I_get_it?.md b/doc_archive/faqs/What's_in_the_resource_bundle_and_how_can_I_get_it?.md new file mode 100644 index 000000000..e8fe9ea4d --- /dev/null +++ b/doc_archive/faqs/What's_in_the_resource_bundle_and_how_can_I_get_it?.md @@ -0,0 +1,49 @@ +## What's in the resource bundle and how can I get it? + +http://gatkforums.broadinstitute.org/gatk/discussion/1213/whats-in-the-resource-bundle-and-how-can-i-get-it + +

NOTE: we recently made some changes to the bundle on the FTP server; see the Resource Bundle page for details. In a nutshell: minor directory structure changes, and Hg38 bundle now mirrors the cloud version.

+
+

1. Accessing the bundle

+

See the Resource Bundle page. In a nutshell, there's a Google Cloud bucket and an FTP server. The cloud bucket only has Hg38 resources; the resources for other builds are currently only available through the FTP server. Let us know if you want them on the Cloud too.
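+

If you have the Google Cloud SDK installed, the cloud copy of the Hg38 resources can be listed and fetched with gsutil. The bucket path below is a placeholder; substitute the address given on the Resource Bundle page:

+
 $ gsutil ls gs://<bundle-bucket>/hg38/
+$ gsutil cp gs://<bundle-bucket>/hg38/Homo_sapiens_assembly38.dict .
+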

+
+

2. GRCh38/Hg38 Resources: the soon-to-be Standard Set

+

This contains all the resource files needed for Best Practices short variant discovery in whole-genome sequencing data (WGS). Exome files and itemized resource list coming soon(ish).

+
+

All resources below this are available only on the FTP server, not on the cloud.

+
+

3. b37 Resources: the Standard Data Set pending completion of the Hg38 bundle

+ +

Additionally, these files all have supplementary indices, statistics, and other QC data available.

+
+

4. hg19 Resources: lifted over from b37

+

Includes the UCSC-style hg19 reference along with all lifted over VCF files.

+
+

5. hg18 Resources: lifted over from b37

+

Includes the UCSC-style hg18 reference along with all lifted over VCF files. The refGene track and BAM files are not available. We only provide data files for this genome-build that can be lifted over "easily" from our master b37 repository. Sorry for whatever inconvenience this might cause.

+

Also includes a chain file to lift over to b37.

+
+

6. b36 Resources: lifted over from b37

+

Includes the 1000 Genomes pilot b36 formatted reference sequence (human_b36_both.fasta) along with all lifted over VCF files. The refGene track and BAM files are not available. We only provide data files for this genome-build that can be lifted over "easily" from our master b37 repository. Sorry for whatever inconvenience this might cause.

+

Also includes a chain file to lift over to b37.

\ No newline at end of file diff --git a/doc_archive/faqs/What_are_the_prerequisites_for_running_GATK?.md b/doc_archive/faqs/What_are_the_prerequisites_for_running_GATK?.md new file mode 100644 index 000000000..def6d69bf --- /dev/null +++ b/doc_archive/faqs/What_are_the_prerequisites_for_running_GATK?.md @@ -0,0 +1,12 @@ +## What are the prerequisites for running GATK? + +http://gatkforums.broadinstitute.org/gatk/discussion/1852/what-are-the-prerequisites-for-running-gatk + +

1. Operating system

+

The GATK runs natively on most if not all flavors of UNIX, including MacOSX, Linux and BSD. It is possible to get it running on Windows using Cygwin, but we don't provide any support or instructions for that.

+

2. Java 7 / 1.7

+

The GATK is a Java-based program, so you'll need to have Java installed on your machine. The Java version should be at 1.7 (at this time we don't officially support 1.8, and 1.6 no longer works). You can check what version you have by typing java -version at the command line. This article has some more details about what to do if you don't have the right version. Note that at this time we only support the Sun/Oracle Java JDK; OpenJDK is not supported.

+

3. R dependencies

+

Some of the GATK tools produce plots using R, so if you want to get the plots you'll need to have R and Rscript installed, as well as several R libraries. Full details can be found in the Tutorial on installing required software.

+

4. Familiarity with command-line programs

+

The GATK does not have a Graphical User Interface (GUI). You don't open it by clicking on the .jar file; you have to use the Console (or Terminal) to input commands. If this is all new to you, we recommend you first learn about that and follow some online tutorials before trying to use the GATK. It's not difficult but you'll need to learn some jargon and get used to living without a mouse. Trust us, it's a liberating experience :)

\ No newline at end of file diff --git a/doc_archive/faqs/What_do_I_need_to_do_before_attending_a_workshop_hands-on_session?.md b/doc_archive/faqs/What_do_I_need_to_do_before_attending_a_workshop_hands-on_session?.md new file mode 100644 index 000000000..02ddff86c --- /dev/null +++ b/doc_archive/faqs/What_do_I_need_to_do_before_attending_a_workshop_hands-on_session?.md @@ -0,0 +1,11 @@ +## What do I need to do before attending a workshop hands-on session? + +http://gatkforums.broadinstitute.org/gatk/discussion/4610/what-do-i-need-to-do-before-attending-a-workshop-hands-on-session + +

So you're going to a GATK workshop, and you've been selected to participate in a hands-on session? Fantastic! We're looking forward to walking you through some exercises that will help you master the tools. However -- in order to make the best of the time we have together, we'd like to ask you to come prepared. Specifically, if the workshop hosts are not providing machines and you have been asked to bring your own laptop, please complete the following steps:

+

- Download and install all necessary software as described in this tutorial.

+

Note that if you are a Mac user, you may need to install Apple's XCode Tools, which are free but fairly large, so plan ahead because it can take a loooong time to download them if your connection is anything less than super-fast.

+

- Download the tutorial bundle from the link provided by the workshop organizers.

+

This will typically be provided by email two to three weeks before the date of the workshop.

+

At the start of the session, we'll give you handouts with a walkthrough of the session so you can follow along and take notes (highly recommended!).

+

With that, you should be all set. See you soon!

\ No newline at end of file diff --git a/doc_archive/faqs/What_do_the_VariantEval_modules_do?.md b/doc_archive/faqs/What_do_the_VariantEval_modules_do?.md new file mode 100644 index 000000000..47c2a7aa6 --- /dev/null +++ b/doc_archive/faqs/What_do_the_VariantEval_modules_do?.md @@ -0,0 +1,263 @@ +## What do the VariantEval modules do? + +http://gatkforums.broadinstitute.org/gatk/discussion/2361/what-do-the-varianteval-modules-do + +

VariantEval accepts two types of modules: stratification and evaluation modules.

+ +

CpG

+

CpG is a three-state stratification:

+ +

A CpG site is defined as a site where the reference base at a locus is a C and the adjacent reference base in the 3' direction is a G.

+

EvalRod

+

EvalRod is an N-state stratification, where N is the number of eval rods bound to VariantEval.

+

Sample

+

Sample is an N-state stratification, where N is the number of samples in the eval files.

+

Filter

+

Filter is a three-state stratification:

+ +

FunctionalClass

+

FunctionalClass is a four-state stratification:

+ +

CompRod

+

CompRod is an N-state stratification, where N is the number of comp tracks bound to VariantEval.

+

Degeneracy

+

Degeneracy is a six-state stratification:

+ +

See the [Wikipedia page on degeneracy](http://en.wikipedia.org/wiki/Genetic_code#Degeneracy) for more information.

+

JexlExpression

+

JexlExpression is an N-state stratification, where N is the number of JEXL expressions supplied to VariantEval. See the documentation article on using JEXL expressions.

+

Novelty

+

Novelty is a three-state stratification:

+ +

CountVariants

+

CountVariants is an evaluation module that computes the following metrics:

| Metric | Definition |
|--------|------------|
| nProcessedLoci | Number of processed loci |
| nCalledLoci | Number of called loci |
| nRefLoci | Number of reference loci |
| nVariantLoci | Number of variant loci |
| variantRate | Variants per loci rate |
| variantRatePerBp | Number of variants per base |
| nSNPs | Number of SNP loci |
| nInsertions | Number of insertions |
| nDeletions | Number of deletions |
| nComplex | Number of complex loci |
| nNoCalls | Number of no-call loci |
| nHets | Number of het loci |
| nHomRef | Number of hom ref loci |
| nHomVar | Number of hom var loci |
| nSingletons | Number of singletons |
| heterozygosity | Heterozygosity per locus rate |
| heterozygosityPerBp | Heterozygosity per base pair |
| hetHomRatio | Heterozygosity to homozygosity ratio |
| indelRate | Indel rate (insertion count + deletion count) |
| indelRatePerBp | Indel rate per base pair |
| deletionInsertionRatio | Deletion to insertion ratio |
+

CompOverlap

+

CompOverlap is an evaluation module that computes the following metrics:

| Metric | Definition |
|--------|------------|
| nEvalSNPs | number of eval SNP sites |
| nCompSNPs | number of comp SNP sites |
| novelSites | number of eval sites outside of comp sites |
| nVariantsAtComp | number of eval sites at comp sites (that is, sharing the same locus as a variant in the comp track, regardless of whether the alternate allele is the same) |
| compRate | percentage of eval sites at comp sites |
| nConcordant | number of concordant sites (that is, for the sites that share the same locus as a variant in the comp track, those that have the same alternate allele) |
| concordantRate | the concordance rate |
+

Understanding the output of CompOverlap

+

A SNP in the detection set is said to be 'concordant' if the position exactly matches an entry in dbSNP and the allele is the same. To understand this and other output of CompOverlap, we shall examine a detailed example. First, consider a fake dbSNP file (headers are suppressed so that one can see the important things):

+
 $ grep -v '##' dbsnp.vcf
+ #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
+ 1       10327   rs112750067     T       C       .       .       ASP;R5;VC=SNP;VP=050000020005000000000100;WGT=1;dbSNPBuildID=132
+

Now, a detection set file with a single sample, where the variant allele is the same as listed in dbSNP:

+
 $ grep -v '##' eval_correct_allele.vcf
+ #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT            001-6
+ 1       10327   .       T       C       5168.52 PASS    ...     GT:AD:DP:GQ:PL    0/1:357,238:373:99:3959,0,4059
+

Finally, a detection set file with a single sample, but the alternate allele differs from that in dbSNP:

+
 $ grep -v '##' eval_incorrect_allele.vcf
+ #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT            001-6
+ 1       10327   .       T       A       5168.52 PASS    ...     GT:AD:DP:GQ:PL    0/1:357,238:373:99:3959,0,4059
+

Running VariantEval with just the CompOverlap module:

+
 $ java -jar $STING_DIR/dist/GenomeAnalysisTK.jar -T VariantEval \
+        -R /seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta \
+        -L 1:10327 \
+        -B:dbsnp,VCF dbsnp.vcf \
+        -B:eval_correct_allele,VCF eval_correct_allele.vcf \
+        -B:eval_incorrect_allele,VCF eval_incorrect_allele.vcf \
+        -noEV \
+        -EV CompOverlap \
+        -o eval.table
+

We find that the eval.table file contains the following:

+
 $ grep -v '##' eval.table | column -t 
+ CompOverlap  CompRod  EvalRod                JexlExpression  Novelty  nEvalVariants  nCompVariants  novelSites  nVariantsAtComp  compRate      nConcordant  concordantRate
+ CompOverlap  dbsnp    eval_correct_allele    none            all      1              1              0           1                100.00000000  1            100.00000000
+ CompOverlap  dbsnp    eval_correct_allele    none            known    1              1              0           1                100.00000000  1            100.00000000
+ CompOverlap  dbsnp    eval_correct_allele    none            novel    0              0              0           0                0.00000000    0            0.00000000
+ CompOverlap  dbsnp    eval_incorrect_allele  none            all      1              1              0           1                100.00000000  0            0.00000000
+ CompOverlap  dbsnp    eval_incorrect_allele  none            known    1              1              0           1                100.00000000  0            0.00000000
+ CompOverlap  dbsnp    eval_incorrect_allele  none            novel    0              0              0           0                0.00000000    0            0.00000000
+

As you can see, the detection set variant was listed under nVariantsAtComp (meaning the variant was seen at a position listed in dbSNP), but only the eval_correct_allele dataset is shown to be concordant at that site, because the allele listed in this dataset and dbSNP match.

+

TiTvVariantEvaluator

+

TiTvVariantEvaluator is an evaluation module that computes the following metrics:

| Metric | Definition |
|--------|------------|
| nTi | number of transition loci |
| nTv | number of transversion loci |
| tiTvRatio | the transition to transversion ratio |
| nTiInComp | number of comp transition sites |
| nTvInComp | number of comp transversion sites |
| TiTvRatioStandard | the transition to transversion ratio for comp sites |
\ No newline at end of file diff --git a/doc_archive/faqs/What_input_files_does_the_GATK_accept___require?.md b/doc_archive/faqs/What_input_files_does_the_GATK_accept___require?.md new file mode 100644 index 000000000..675f67bdf --- /dev/null +++ b/doc_archive/faqs/What_input_files_does_the_GATK_accept___require?.md @@ -0,0 +1,66 @@ +## What input files does the GATK accept / require? + +http://gatkforums.broadinstitute.org/gatk/discussion/1204/what-input-files-does-the-gatk-accept-require + +

All analyses done with the GATK typically involve several (though not necessarily all) of the following inputs:

+ +

This article describes the corresponding file formats that are acceptable for use with the GATK.

+
+

1. Reference Genome Sequence

+

The GATK requires the reference sequence as a single FASTA file, with all contigs in the same file, and with strict adherence to the FASTA standard. All the standard IUPAC bases are accepted, but keep in mind that non-standard bases (i.e. other than ACGT, such as W for example) will be ignored (i.e. those positions in the genome will be skipped).

+

Some users have reported having issues with reference files that have been stored or modified on Windows filesystems. The issues manifest as "10" characters (corresponding to encoded newlines) inserted in the sequence, which cause the GATK to quit with an error. If you encounter this issue, you will need to re-download a valid master copy of the reference file, or clean it up yourself.

+

Gzipped fasta files will not work with the GATK, so please make sure to unzip them first. Please see this article for more information on preparing FASTA reference sequences for use with the GATK.
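
As a hedged sketch of the typical preparation steps (assuming samtools and a combined picard.jar are available; human.fasta is a placeholder name), you would uncompress the reference, then create the index and sequence dictionary that the GATK expects alongside it:

gunzip human.fasta.gz
samtools faidx human.fasta
java -jar picard.jar CreateSequenceDictionary R=human.fasta O=human.dict

The resulting .fai and .dict files must sit next to the FASTA file for the GATK to find them.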

+

Important note about human genome reference versions

+

If you are using human data, your reads must be aligned to one of the official b3x (e.g. b36, b37) or hg1x (e.g. hg18, hg19) references. The names and order of the contigs in the reference you used must exactly match the canonical ordering of one of the official references. These orderings are defined by historical karyotyping of largest to smallest chromosomes, followed by the X, Y, and MT for the b3x references; the order is thus 1, 2, 3, ..., 10, 11, 12, ..., 20, 21, 22, X, Y, MT. The hg1x references differ in that the chromosome names are prefixed with "chr" and chrM appears first instead of last. The GATK will detect misordered contigs (for example, lexicographically sorted) and throw an error. This draconian approach, though technically unnecessary, ensures that all supplementary data provided with the GATK works correctly. You can use ReorderSam to fix a BAM file aligned to a missorted reference sequence.

+

Our Best Practice recommendation is that you use a standard GATK reference from the GATK resource bundle.

+
+

2. Sequencing Reads

+

The only input format for sequence reads that the GATK itself supports is the Sequence Alignment/Map (SAM) format. See the SAM/BAM documentation for more details on the SAM/BAM format, as well as Samtools and Picard, two complementary sets of utilities for working with SAM/BAM files.

+

If you don't find the information you need in this section, please see our FAQs on BAM files.

+

If you are starting out your pipeline with raw reads (typically in FASTQ format) you'll need to make sure that when you map those reads to the reference and produce a BAM file, the resulting BAM file is fully compliant with the GATK requirements. See the Best Practices documentation for detailed instructions on how to do this.

+

In addition to being in SAM format, we require the following additional constraints in order to use your file with the GATK:

+ +

Below is an example of a well-formed SAM header and records (with the @SQ dictionary truncated to show only the first two chromosomes for brevity):

+
@HD     VN:1.0  GO:none SO:coordinate
+@SQ     SN:1    LN:249250621    AS:NCBI37       UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta    M5:1b22b98cdeb4a9304cb5d48026a85128
+@SQ     SN:2    LN:243199373    AS:NCBI37       UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta    M5:a0d9851da00400dec1098a9255ac712e
+@RG     ID:ERR000162    PL:ILLUMINA     LB:g1k-sc-NA12776-CEU-1 PI:200  DS:SRP000031    SM:NA12776      CN:SC
+@RG     ID:ERR000252    PL:ILLUMINA     LB:g1k-sc-NA12776-CEU-1 PI:200  DS:SRP000031    SM:NA12776      CN:SC
+@RG     ID:ERR001684    PL:ILLUMINA     LB:g1k-sc-NA12776-CEU-1 PI:200  DS:SRP000031    SM:NA12776      CN:SC
+@RG     ID:ERR001685    PL:ILLUMINA     LB:g1k-sc-NA12776-CEU-1 PI:200  DS:SRP000031    SM:NA12776      CN:SC
+@PG     ID:GATK TableRecalibration      VN:v2.2.16      CL:Covariates=[ReadGroupCovariate, QualityScoreCovariate, DinucCovariate, CycleCovariate], use_original_quals=true, default_read_group=DefaultReadGroup, default_platform=Illumina, force_read_group=null, force_platform=null, solid_recal_mode=SET_Q_ZERO, window_size_nqs=5, homopolymer_nback=7, exception_if_no_tile=false, pQ=5, maxQ=40, smoothing=137       UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta    M5:b4eb71ee878d3706246b7c1dbef69299
+@PG     ID:bwa  VN:0.5.5
+ERR001685.4315085       16      1       9997    25      35M     *       0       0       CCGATCTCCCTAACCCTAACCCTAACCCTAACCCT     ?8:C7ACAABBCBAAB?CCAABBEBA@ACEBBB@?     XT:A:U  XN:i:4    X0:i:1  X1:i:0  XM:i:2  XO:i:0  XG:i:0  RG:Z:ERR001685  NM:i:6  MD:Z:0N0N0N0N1A0A28     OQ:Z:>>:>2>>>>>>>>>>>>>>>>>>?>>>>??>???>
+ERR001689.1165834       117     1       9997    0       *       =       9997    0       CCGATCTAGGGTTAGGGTTAGGGTTAGGGTTAGGG     >7AA<@@C?@?B?B??>9?B??>A?B???BAB??@     RG:Z:ERR001689    OQ:Z:>:<<8<<<><<><><<>7<>>>?>>??>???????
+ERR001689.1165834       185     1       9997    25      35M     =       9997    0       CCGATCTCCCTAACCCTAACCCTAACCCTAACCCT     758A:?>>8?=@@>>?;4<>=??@@==??@?==?8     XT:A:U  XN:i:4    SM:i:25 AM:i:0  X0:i:1  X1:i:0  XM:i:2  XO:i:0  XG:i:0  RG:Z:ERR001689  NM:i:6  MD:Z:0N0N0N0N1A0A28     OQ:Z:;74>7><><><>>>>><:<>>>>>>>>>>>>>>>>
+ERR001688.2681347       117     1       9998    0       *       =       9998    0       CGATCTTAGGGTTAGGGTTAGGGTTAGGGTTAGGG     5@BA@A6B???A?B??>B@B??>B@B??>BAB???     RG:Z:ERR001688    OQ:Z:=>>>><4><<?><??????????????????????       
+

Note about fixing BAM files with alternative sortings

+

The GATK requires that the BAM file be sorted in the same order as the reference. Unfortunately, many BAM files have headers that are sorted in some other order -- lexicographical order is a common alternative. To re-sort the BAM file, please use Picard's ReorderSam.
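
As a hedged sketch (Picard argument syntax varies somewhat between versions; file names are placeholders), a ReorderSam invocation looks roughly like this:

java -jar picard.jar ReorderSam \
    INPUT=lexicographically_sorted.bam \
    OUTPUT=reordered.bam \
    REFERENCE=human_g1k_v37.fasta

This rewrites the BAM so that its records and header follow the contig order of the given reference.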

+
+

3. Intervals of interest

+

The GATK accepts interval files for processing subsets of the genome in several different formats. Please see the FAQs on interval lists for details.

+
+

4. Reference Ordered Data (ROD) file formats

+

The GATK can associate arbitrary reference ordered data (ROD) files with named tracks for all tools. Some tools require specific ROD data files for processing, and developers are free to write tools that access arbitrary data sets using the ROD interface. The general ROD system has the following syntax:

+
-argumentName:name,type file
+

Where name is the name in the GATK tool (like "eval" in VariantEval), type is the type of the file, such as VCF or dbSNP, and file is the path to the file containing the ROD data.
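
For example (an illustrative binding that simply instantiates the syntax above; the file name is a placeholder), passing a VCF callset to VariantEval under the name "eval" would look something like:

-eval:my_callset,VCF my_callset.vcf

In recent GATK versions the file type can usually be detected automatically, but the general pattern of argument name, track name and file remains the same.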

+

The GATK supports several common file formats for reading ROD data:

+ +

Note that we no longer support the PED format. See here for converting .ped files to VCF.

+

If you need additional information on VCF files, please see our FAQs on VCF files here and here.

\ No newline at end of file diff --git "a/doc_archive/faqs/What_is_\"Phone_Home\"_and_how_does_it_affect_me?.md" "b/doc_archive/faqs/What_is_\"Phone_Home\"_and_how_does_it_affect_me?.md" new file mode 100644 index 000000000..5813aecd2 --- /dev/null +++ "b/doc_archive/faqs/What_is_\"Phone_Home\"_and_how_does_it_affect_me?.md" @@ -0,0 +1,108 @@ +## What is "Phone Home" and how does it affect me? + +http://gatkforums.broadinstitute.org/gatk/discussion/1250/what-is-phone-home-and-how-does-it-affect-me + +

In GATK versions produced between September 2010 and May 2016, the GATK had a "Phone Home" usage reporting feature that sent us information about each GATK run via the Broad filesystem (within the Broad) and Amazon's S3 cloud storage service (outside the Broad). This feature was enabled by default and required a key to be disabled (for running offline or for regulatory reasons).

+

The Phone Home feature was removed in version 3.6. Keys are no longer necessary, so if you had one, you can stop using it. We do not expect that including Phone Home arguments in GATK command lines would cause any errors (so this should not break any scripts), but let us know if you run into any trouble.

+

Note that keys remain necessary for disabling Phone Home in older versions of GATK. See further below for details on how to obtain a key.

+
+

How Phone Home helped development

+

At the time, the information provided by the Phone Home feature was critical in driving improvements to the GATK:

+ +
+

What information was sent to us

+

Below are two example GATK Run Reports showing exactly what information was sent to us each time the GATK phoned home.

+

A successful run:

+
<GATK-run-report>
+    <id>D7D31ULwTSxlAwnEOSmW6Z4PawXwMxEz</id>
+    <start-time>2012/03/10 20.21.19</start-time>
+    <end-time>2012/03/10 20.21.19</end-time>
+    <run-time>0</run-time>
+    <walker-name>CountReads</walker-name>
+    <svn-version>1.4-483-g63ecdb2</svn-version>
+    <total-memory>85000192</total-memory>
+    <max-memory>129957888</max-memory>
+    <user-name>depristo</user-name>
+    <host-name>10.0.1.10</host-name>
+    <java>Apple Inc.-1.6.0_26</java>
+    <machine>Mac OS X-x86_64</machine>
+    <iterations>105</iterations>
+</GATK-run-report>
+

A run where an exception has occurred:

+
<GATK-run-report>
+   <id>yX3AnltsqIlXH9kAQqTWHQUd8CQ5bikz</id>   
+   <exception>
+      <message>Failed to parse Genome Location string: 20:10,000,000-10,000,001x</message>
+      <stacktrace class="java.util.ArrayList"> 
+         <string>org.broadinstitute.sting.utils.GenomeLocParser.parseGenomeLoc(GenomeLocParser.java:377)</string>
+         <string>org.broadinstitute.sting.utils.interval.IntervalUtils.parseIntervalArguments(IntervalUtils.java:82)</string>
+         <string>org.broadinstitute.sting.commandline.IntervalBinding.getIntervals(IntervalBinding.java:106)</string>
+         <string>org.broadinstitute.sting.gatk.GenomeAnalysisEngine.loadIntervals(GenomeAnalysisEngine.java:618)</string>
+         <string>org.broadinstitute.sting.gatk.GenomeAnalysisEngine.initializeIntervals(GenomeAnalysisEngine.java:585)</string>
+         <string>org.broadinstitute.sting.gatk.GenomeAnalysisEngine.execute(GenomeAnalysisEngine.java:231)</string>
+         <string>org.broadinstitute.sting.gatk.CommandLineExecutable.execute(CommandLineExecutable.java:128)</string>
+         <string>org.broadinstitute.sting.commandline.CommandLineProgram.start(CommandLineProgram.java:236)</string>
+         <string>org.broadinstitute.sting.commandline.CommandLineProgram.start(CommandLineProgram.java:146)</string>
+         <string>org.broadinstitute.sting.gatk.CommandLineGATK.main(CommandLineGATK.java:92)</string>
+      </stacktrace>
+      <cause>
+         <message>Position: &apos;10,000,001x&apos; contains invalid chars.</message>
+         <stacktrace class="java.util.ArrayList">
+            <string>org.broadinstitute.sting.utils.GenomeLocParser.parsePosition(GenomeLocParser.java:411)</string>
+            <string>org.broadinstitute.sting.utils.GenomeLocParser.parseGenomeLoc(GenomeLocParser.java:374)</string>
+            <string>org.broadinstitute.sting.utils.interval.IntervalUtils.parseIntervalArguments(IntervalUtils.java:82)</string>
+            <string>org.broadinstitute.sting.commandline.IntervalBinding.getIntervals(IntervalBinding.java:106)</string>
+            <string>org.broadinstitute.sting.gatk.GenomeAnalysisEngine.loadIntervals(GenomeAnalysisEngine.java:618)</string>
+            <string>org.broadinstitute.sting.gatk.GenomeAnalysisEngine.initializeIntervals(GenomeAnalysisEngine.java:585)</string>
+            <string>org.broadinstitute.sting.gatk.GenomeAnalysisEngine.execute(GenomeAnalysisEngine.java:231)</string>
+            <string>org.broadinstitute.sting.gatk.CommandLineExecutable.execute(CommandLineExecutable.java:128)</string>
+            <string>org.broadinstitute.sting.commandline.CommandLineProgram.start(CommandLineProgram.java:236)</string>
+            <string>org.broadinstitute.sting.commandline.CommandLineProgram.start(CommandLineProgram.java:146)</string>
+            <string>org.broadinstitute.sting.gatk.CommandLineGATK.main(CommandLineGATK.java:92)</string>
+         </stacktrace>
+         <is-user-exception>false</is-user-exception>
+      </cause>
+      <is-user-exception>true</is-user-exception>
+   </exception>
+   <start-time>2012/03/10 20.19.52</start-time>
+   <end-time>2012/03/10 20.19.52</end-time>
+   <run-time>0</run-time>
+   <walker-name>CountReads</walker-name>
+   <svn-version>1.4-483-g63ecdb2</svn-version>
+   <total-memory>85000192</total-memory>
+   <max-memory>129957888</max-memory>
+   <user-name>depristo</user-name>
+   <host-name>10.0.1.10</host-name>
+   <java>Apple Inc.-1.6.0_26</java>
+   <machine>Mac OS X-x86_64</machine>
+   <iterations>0</iterations>
+</GATK-run-report>
+

Note that starting with GATK 1.5, we no longer collected information about the command line executed, the working directory, or the tmp directory.

+
+

Disabling Phone Home

+

Versions of GATK older than 3.6 attempted to "phone home" as a normal part of each run. However, we recognized that some of our users needed to run the GATK with Phone Home disabled. To allow this, we provided an option (-et NO_ET) in GATK 1.5 and later that disables the Phone Home feature. To use this option, you need to contact us to request a key. Instructions for doing so are below.

+

How to obtain and use a GATK key

+

To obtain a GATK key, please fill out the request form.

+

Running the GATK with a key is simple: you just need to append a -K your.key argument to your customary command line, where your.key is the path to the key file you obtained from us:

+
java -jar dist/GenomeAnalysisTK.jar \
+    -T PrintReads \
+    -I public/testdata/exampleBAM.bam \
+    -R public/testdata/exampleFASTA.fasta \
+    -et NO_ET \
+    -K your.key
+

The -K argument is only necessary when running the GATK with the NO_ET option.

+

Troubleshooting key-related problems

+ +

If you get an error message from the GATK saying that your key is corrupt, unreadable, or has been revoked, please apply for a new key.

+ +

If you get an error message stating that the GATK public key could not be located or read, then something is likely wrong with your build of the GATK. If you're running the binary release, try downloading it again. If you're compiling from source, try re-compiling. If all else fails, please ask for help on our community forum.

\ No newline at end of file diff --git "a/doc_archive/faqs/What_is_GATK-Lite_and_how_does_it_relate_to_\"full\"_GATK_2.x?_[RETIRED].md" "b/doc_archive/faqs/What_is_GATK-Lite_and_how_does_it_relate_to_\"full\"_GATK_2.x?_[RETIRED].md" new file mode 100644 index 000000000..d0ad65aac --- /dev/null +++ "b/doc_archive/faqs/What_is_GATK-Lite_and_how_does_it_relate_to_\"full\"_GATK_2.x?_[RETIRED].md" @@ -0,0 +1,34 @@ +## What is GATK-Lite and how does it relate to "full" GATK 2.x? [RETIRED] + +http://gatkforums.broadinstitute.org/gatk/discussion/1720/what-is-gatk-lite-and-how-does-it-relate-to-full-gatk-2-x-retired + +

Please note that GATK-Lite was retired in February 2013 when version 2.4 was released. See the announcement here.

+
+

You probably know by now that GATK-Lite is a free-for-everyone and completely open-source version of the GATK (licensed under the original [MIT license](http://en.wikipedia.org/wiki/MIT_License)).

+

But what's in the box? What can GATK-Lite do -- or rather, what can it not do that the full version (let's call it GATK-Full) can? And what does that mean exactly, in terms of functionality, reliability and power?

+

To really understand the differences between GATK-Lite and GATK-Full, you need some more information on how the GATK works, and how we work to develop and improve it.

+

First, you need to understand what the two core components of the GATK are: the engine and the tools (see picture below).

+

As explained here, the engine handles all the common work that's related to data access, conversion and traversal, as well as high-performance computing features. The engine is supported by an infrastructure of software libraries. If the GATK were a car, that would be the engine and chassis. What we call the *tools* are attached on top of that, and they provide the various analytical and processing functionalities like variant calling and base or variant recalibration. On your car, those would be the headlights, airbags and so on.

+

Core GATK components

+

Second is how we work on developing the GATK, and what it means for how improvements are shared (or not) between Lite and Full.

+

We do all our development work on a single codebase. This means that everything --the engine and all tools-- is on one common workbench. There are not different versions that we work on in parallel -- that would be crazy to manage! That's why the version numbers of GATK-Lite and GATK-Full always match: if the latest GATK-Full version is numbered 2.1-13, then the latest GATK-Lite is also numbered 2.1-13.

+

The most important consequence of this setup is that when we make improvements to the infrastructure and engine, the same improvements end up in both GATK-Lite and GATK-Full. So in terms of the power, speed and robustness that are determined by the engine, there is no difference between them.

+

For the tools, it's a little more complicated -- but not much. When we "build" the GATK binaries (the .jar files), we put everything from the workbench into the Full build, but we only put a subset into the Lite build. Note that this Lite subset is pretty big -- it contains all the tools that were previously available in GATK 1.x versions, and always will. We also reserve the right to add previews or not-fully-featured versions of the new tools that are in Full, at our discretion, to the Lite build.

+

So there are two basic types of differences between the tools available in the Lite and Full builds (see picture below).

+
  1. We have a new tool that performs a brand new function (which wasn't available in GATK 1.x), and we only include it in the Full build.

  2. We have a tool that has some new add-on capabilities (which weren't possible in GATK 1.x); we put the tool in both the Lite and the Full build, but the add-ons are only available in the Full build.
+

Tools in Lite vs. Full

+

Reprising the car analogy, GATK-Lite and GATK-Full are like two versions of the same car -- the basic version and the fully-equipped one. They both have the exact same engine, and most of the equipment (tools) is the same -- for example, they both have the same airbag system, and they both have headlights. But there are a few important differences:

+
  1. The GATK-Full car comes with a GPS (sat-nav for our UK friends), for which the Lite car has no equivalent. You could buy a portable GPS unit from a third-party store for your Lite car, but it might not be as good, and certainly not as convenient, as the Full car's built-in one.

  2. Both cars have windows of course, but the Full car has power windows, while the Lite car doesn't. The Lite windows can open and close, but you have to operate them by hand, which is much slower.
+

So, to summarize:

+

The underlying engine is exactly the same in both GATK-Lite and GATK-Full. Most functionalities are available in both builds, performed by the same tools. Some functionalities are available in both builds, but they are performed by different tools, and the tool in the Full build is better. New, cutting-edge functionalities are only available in the Full build, and there is no equivalent in the Lite build.

+

We hope this clears up some of the confusion surrounding GATK-Lite. If not, please leave a comment and we'll do our best to clarify further!

\ No newline at end of file diff --git "a/doc_archive/faqs/What_is_Map_Reduce_and_why_are_GATK_tools_called_\"walkers\"?.md" "b/doc_archive/faqs/What_is_Map_Reduce_and_why_are_GATK_tools_called_\"walkers\"?.md" new file mode 100644 index 000000000..bcc0fb08d --- /dev/null +++ "b/doc_archive/faqs/What_is_Map_Reduce_and_why_are_GATK_tools_called_\"walkers\"?.md" @@ -0,0 +1,28 @@ +## What is Map/Reduce and why are GATK tools called "walkers"? + +http://gatkforums.broadinstitute.org/gatk/discussion/1754/what-is-map-reduce-and-why-are-gatk-tools-called-walkers + +

Overview

+

One of the key challenges of working with next-gen sequence data is that input files are usually very large. We can’t just make the program open the files, load all the data into memory and perform whatever analysis is needed on all of it in one go. It’s just too much work, even for supercomputers.

+

Instead, we make the program cut the job into smaller tasks that the computer can easily process separately. Then we have it combine the results of each step into the final result.

+

Map/Reduce

+

Map/Reduce is the technique we use to achieve this. It consists of three steps formally called filter, map and reduce. Let’s apply it to an example case where we want to find out the average depth of coverage in our dataset for a certain region of the genome.

+ +

This may seem trivial for such a simple example, but it is a very powerful method with many advantages. Among other things, it makes it relatively easy to parallelize operations, which makes the tools run much faster on large datasets.
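
As a concrete (and hedged) illustration of this example case, the GATK tool that performs this kind of locus-by-locus coverage calculation is DepthOfCoverage; a minimal command over a single interval might look like this (file and interval names are placeholders):

java -jar GenomeAnalysisTK.jar \
    -T DepthOfCoverage \
    -R reference.fasta \
    -I sample1.bam \
    -L 20:10000000-10100000 \
    -o coverage_summary

Under the hood, that command filters out reads that don't overlap the interval, maps each locus to a depth value, and reduces those values into the summary tables written to the output files.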

+

Walkers, filters and traversal types

+

All the tools in the GATK are built from the ground up to take advantage of this method. That’s why we call them walkers: because they “walk” across the genome, getting things done.

+

Note that even though it’s not included in the Map/Reduce technique’s name, the filter step is very important. It determines what data get presented to the tool for analysis, selecting only the appropriate data for each task and discarding anything that’s not relevant. This is a key part of the Map/Reduce technique, because that’s what makes each task “bite-sized” enough for the computer to handle easily.

+

Each tool has filters that are tailored specifically for the type of analysis it performs. The filters rely on traversal engines, which are little programs that are designed to “traverse” the data (i.e. walk through the data) in specific ways.

+

There are three major types of traversal: Locus Traversal, Read Traversal and Active Region Traversal. In our interval coverage example, the tool’s filter uses the Locus Traversal engine, which walks through the data by locus, i.e. by position along the reference genome. Because of that, the tool is classified as a Locus Walker. Similarly, the Read Traversal engine is used, you’ve guessed it, by Read Walkers.

+

The GATK engine comes packed with many other ways to walk through the genome and get the job done seamlessly, but those are the ones you’ll encounter most often.

+

Further reading

+

A primer on parallelism with the GATK
How can I use parallelism to make GATK tools run faster?

\ No newline at end of file diff --git a/doc_archive/faqs/What_is_a_GVCF_and_how_is_it_different_from_a_'regular'_VCF?.md b/doc_archive/faqs/What_is_a_GVCF_and_how_is_it_different_from_a_'regular'_VCF?.md new file mode 100644 index 000000000..85aae8f30 --- /dev/null +++ b/doc_archive/faqs/What_is_a_GVCF_and_how_is_it_different_from_a_'regular'_VCF?.md @@ -0,0 +1,90 @@ +## What is a GVCF and how is it different from a 'regular' VCF? + +http://gatkforums.broadinstitute.org/gatk/discussion/4017/what-is-a-gvcf-and-how-is-it-different-from-a-regular-vcf + +

Overview

+

GVCF stands for Genomic VCF. A GVCF is a kind of VCF, so the basic format specification is the same as for a regular VCF (see the spec documentation here), but a Genomic VCF contains extra information.

+

This document explains what that extra information is and how you can use it to empower your variant analyses.

+

Important caveat

+

What we're covering here is strictly limited to GVCFs produced by HaplotypeCaller in GATK versions 3.0 and above. The term GVCF is sometimes used simply to describe VCFs that contain a record for every position in the genome (or interval of interest) regardless of whether a variant was detected at that site or not (such as VCFs produced by UnifiedGenotyper with --output_mode EMIT_ALL_SITES). GVCFs produced by HaplotypeCaller 3.x contain additional information that is formatted in a very specific way. Read on to find out more.

+

General comparison of VCF vs. gVCF

+

The key difference between a regular VCF and a gVCF is that the gVCF has records for all sites, whether there is a variant call there or not. The goal is to have every site represented in the file in order to do joint analysis of a cohort in subsequent steps. The records in a gVCF include an accurate estimation of how confident we are in the determination that the sites are homozygous-reference or not. This estimation is generated by the HaplotypeCaller's built-in reference model.

+ +

Note that some other tools (including the GATK's own UnifiedGenotyper) may output an all-sites VCF that looks superficially like the BP_RESOLUTION gVCFs produced by HaplotypeCaller, but they do not provide an accurate estimate of reference confidence, and therefore cannot be used in joint genotyping analyses.

+

The two types of gVCFs

+

As you can see in the figure above, there are two options you can use with -ERC: GVCF and BP_RESOLUTION. With BP_RESOLUTION, you get a gVCF with an individual record at every site: either a variant record, or a non-variant record. With GVCF, you get a gVCF with individual variant records for variant sites, but the non-variant sites are grouped together into non-variant block records that represent intervals of sites for which the genotype quality (GQ) is within a certain range or band. The GQ ranges are defined in the ##GVCFBlock line of the gVCF header. The purpose of the blocks (also called banding) is to keep file size down, and there is no downside for the downstream analysis, so we do recommend using the -GVCF option.
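
For reference, a hedged sketch of a HaplotypeCaller command that produces this kind of banded gVCF (GATK 3.x syntax; file names are placeholders) looks something like this:

java -jar GenomeAnalysisTK.jar \
    -T HaplotypeCaller \
    -R reference.fasta \
    -I sample1.bam \
    --emitRefConfidence GVCF \
    -o sample1.g.vcf

Using --emitRefConfidence BP_RESOLUTION instead would produce the per-site variant of the format described above.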

+

Example gVCF file

+

This is a banded gVCF produced by HaplotypeCaller with the -GVCF option.

+

Header:

+

As you can see in the first line, the basic file format is a valid version 4.1 VCF:

+
##fileformat=VCFv4.1
+##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
+##FILTER=<ID=LowQual,Description="Low quality">
+##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
+##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
+##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
+##GVCFBlock=minGQ=20(inclusive),maxGQ=60(exclusive)
+##GVCFBlock=minGQ=5(inclusive),maxGQ=20(exclusive)
+##GVCFBlock=minGQ=60(inclusive),maxGQ=2147483647(exclusive)
+##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
+##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
+##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
+##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
+##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
+##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
+##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
+##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
+##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
+##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
+##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
+##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
+##contig=<ID=20,length=63025520,assembly=b37>
+##reference=file:///humgen/1kg/reference/human_g1k_v37.fasta
+

Toward the middle you see the ##GVCFBlock lines (after the ##FORMAT lines), repeated here for clarity:

+
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
+##GVCFBlock=minGQ=20(inclusive),maxGQ=60(exclusive)
+##GVCFBlock=minGQ=5(inclusive),maxGQ=20(exclusive)
+

which indicate the GQ ranges used for banding (corresponding to the boundaries [5, 20, 60]).

+

You can also see the definition of the MIN_DP annotation in the ##FORMAT lines.

+

Records

+

The first thing you'll notice, hopefully, is the <NON_REF> symbolic allele listed in every record's ALT field. This provides us with a way to represent the possibility of having a non-reference allele at this site, and to indicate our confidence either way.

+

The second thing to look for is the END tag in the INFO field of non-variant block records. This tells you at what position the block ends. For example, the first line is a non-variant block that starts at position 20:10000000 and ends at 20:10000116.

+
#CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  NA12878
+20  10000000    .   T   <NON_REF>   .   .   END=10000116    GT:DP:GQ:MIN_DP:PL  0/0:44:99:38:0,89,1385
+20  10000117    .   C   T,<NON_REF> 612.77  .   BaseQRankSum=0.000;ClippingRankSum=-0.411;DP=38;MLEAC=1,0;MLEAF=0.500,0.00;MQ=221.39;MQ0=0;MQRankSum=-2.172;ReadPosRankSum=-0.235   GT:AD:DP:GQ:PL:SB   0/1:17,21,0:38:99:641,0,456,691,519,1210:6,11,11,10
+20  10000118    .   T   <NON_REF>   .   .   END=10000210    GT:DP:GQ:MIN_DP:PL  0/0:42:99:38:0,80,1314
+20  10000211    .   C   T,<NON_REF> 638.77  .   BaseQRankSum=0.894;ClippingRankSum=-1.927;DP=42;MLEAC=1,0;MLEAF=0.500,0.00;MQ=221.89;MQ0=0;MQRankSum=-1.750;ReadPosRankSum=1.549    GT:AD:DP:GQ:PL:SB   0/1:20,22,0:42:99:667,0,566,728,632,1360:9,11,12,10
+20  10000212    .   A   <NON_REF>   .   .   END=10000438    GT:DP:GQ:MIN_DP:PL  0/0:52:99:42:0,99,1403
+20  10000439    .   T   G,<NON_REF> 1737.77 .   DP=57;MLEAC=2,0;MLEAF=1.00,0.00;MQ=221.41;MQ0=0 GT:AD:DP:GQ:PL:SB   1/1:0,56,0:56:99:1771,168,0,1771,168,1771:0,0,0,0
+20  10000440    .   T   <NON_REF>   .   .   END=10000597    GT:DP:GQ:MIN_DP:PL  0/0:56:99:49:0,120,1800
+20  10000598    .   T   A,<NON_REF> 1754.77 .   DP=54;MLEAC=2,0;MLEAF=1.00,0.00;MQ=185.55;MQ0=0 GT:AD:DP:GQ:PL:SB   1/1:0,53,0:53:99:1788,158,0,1788,158,1788:0,0,0,0
+20  10000599    .   T   <NON_REF>   .   .   END=10000693    GT:DP:GQ:MIN_DP:PL  0/0:51:99:47:0,120,1800
+20  10000694    .   G   A,<NON_REF> 961.77  .   BaseQRankSum=0.736;ClippingRankSum=-0.009;DP=54;MLEAC=1,0;MLEAF=0.500,0.00;MQ=106.92;MQ0=0;MQRankSum=0.482;ReadPosRankSum=1.537 GT:AD:DP:GQ:PL:SB   0/1:21,32,0:53:99:990,0,579,1053,675,1728:9,12,10,22
+20  10000695    .   G   <NON_REF>   .   .   END=10000757    GT:DP:GQ:MIN_DP:PL  0/0:48:99:45:0,120,1800
+20  10000758    .   T   A,<NON_REF> 1663.77 .   DP=51;MLEAC=2,0;MLEAF=1.00,0.00;MQ=59.32;MQ0=0  GT:AD:DP:GQ:PL:SB   1/1:0,50,0:50:99:1697,149,0,1697,149,1697:0,0,0,0
+20  10000759    .   A   <NON_REF>   .   .   END=10001018    GT:DP:GQ:MIN_DP:PL  0/0:40:99:28:0,65,1080
+20  10001019    .   T   G,<NON_REF> 93.77   .   BaseQRankSum=0.058;ClippingRankSum=-0.347;DP=26;MLEAC=1,0;MLEAF=0.500,0.00;MQ=29.65;MQ0=0;MQRankSum=-0.925;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL:SB   0/1:19,7,0:26:99:122,0,494,179,515,694:12,7,4,3
+20  10001020    .   C   <NON_REF>   .   .   END=10001020    GT:DP:GQ:MIN_DP:PL  0/0:26:72:26:0,72,1080
+20  10001021    .   T   <NON_REF>   .   .   END=10001021    GT:DP:GQ:MIN_DP:PL  0/0:25:37:25:0,37,909
+20  10001022    .   C   <NON_REF>   .   .   END=10001297    GT:DP:GQ:MIN_DP:PL  0/0:30:87:25:0,72,831
+20  10001298    .   T   A,<NON_REF> 1404.77 .   DP=41;MLEAC=2,0;MLEAF=1.00,0.00;MQ=171.56;MQ0=0 GT:AD:DP:GQ:PL:SB   1/1:0,41,0:41:99:1438,123,0,1438,123,1438:0,0,0,0
+20  10001299    .   C   <NON_REF>   .   .   END=10001386    GT:DP:GQ:MIN_DP:PL  0/0:43:99:39:0,95,1226
+20  10001387    .   C   <NON_REF>   .   .   END=10001418    GT:DP:GQ:MIN_DP:PL  0/0:41:42:39:0,21,315
+20  10001419    .   T   <NON_REF>   .   .   END=10001425    GT:DP:GQ:MIN_DP:PL  0/0:45:12:42:0,9,135
+20  10001426    .   A   <NON_REF>   .   .   END=10001427    GT:DP:GQ:MIN_DP:PL  0/0:49:0:48:0,0,1282
+20  10001428    .   T   <NON_REF>   .   .   END=10001428    GT:DP:GQ:MIN_DP:PL  0/0:49:21:49:0,21,315
+20  10001429    .   G   <NON_REF>   .   .   END=10001429    GT:DP:GQ:MIN_DP:PL  0/0:47:18:47:0,18,270
+20  10001430    .   G   <NON_REF>   .   .   END=10001431    GT:DP:GQ:MIN_DP:PL  0/0:45:0:44:0,0,1121
+20  10001432    .   A   <NON_REF>   .   .   END=10001432    GT:DP:GQ:MIN_DP:PL  0/0:43:18:43:0,18,270
+20  10001433    .   T   <NON_REF>   .   .   END=10001433    GT:DP:GQ:MIN_DP:PL  0/0:44:0:44:0,0,1201
+20  10001434    .   G   <NON_REF>   .   .   END=10001434    GT:DP:GQ:MIN_DP:PL  0/0:44:18:44:0,18,270
+20  10001435    .   A   <NON_REF>   .   .   END=10001435    GT:DP:GQ:MIN_DP:PL  0/0:44:0:44:0,0,1130
+20  10001436    .   A   AAGGCT,<NON_REF>    1845.73 .   DP=43;MLEAC=2,0;MLEAF=1.00,0.00;MQ=220.07;MQ0=0 GT:AD:DP:GQ:PL:SB   1/1:0,42,0:42:99:1886,125,0,1888,126,1890:0,0,0,0
+20  10001437    .   A   <NON_REF>   .   .   END=10001437    GT:DP:GQ:MIN_DP:PL  0/0:44:0:44:0,0,0
+

Note that toward the end of this snippet, you see multiple consecutive non-variant block records. These were not merged into a single record because the sites they contain belong to different ranges of GQ (which are defined in the header).

\ No newline at end of file diff --git a/doc_archive/faqs/What_is_a_VCF_and_how_should_I_interpret_it?.md b/doc_archive/faqs/What_is_a_VCF_and_how_should_I_interpret_it?.md new file mode 100644 index 000000000..4d1cae517 --- /dev/null +++ b/doc_archive/faqs/What_is_a_VCF_and_how_should_I_interpret_it?.md @@ -0,0 +1,175 @@ +## What is a VCF and how should I interpret it? + +http://gatkforums.broadinstitute.org/gatk/discussion/1268/what-is-a-vcf-and-how-should-i-interpret-it + +

This document describes "regular" VCF files produced for GERMLINE calls. For information on the special kind of VCF called gVCF, produced by HaplotypeCaller in -ERC GVCF mode, please see this companion document. For information specific to SOMATIC calls, see the MuTect documentation.

+
+

Contents

+
    +
  1. What is VCF?
  2. +
  3. Basic structure of a VCF file
  4. +
  5. Interpreting the VCF file header information
  6. +
  7. Structure of variant call records
  8. +
  9. How the genotype and other sample-level information is represented
  10. +
  11. How to extract information from a VCF in a sane, straightforward way
  12. +
+
+

1. What is VCF?

+

VCF stands for Variant Call Format. It is a standardized text file format for representing SNP, indel, and structural variation calls. The VCF specification used to be maintained by the 1000 Genomes Project, but its management and expansion have been taken over by the Global Alliance for Genomics and Health Data Working group file format team. The full format spec can be found in the Samtools/Hts-specs repository along with other useful specs like SAM/BAM. We highly encourage you to take a look at those documents, as they contain a lot of useful information that we don't go over in this document.

+

VCF is the primary (and only well-supported) format used by the GATK for variant calls. We prefer it above all others because while it can be a bit verbose, the VCF format is very explicit about the exact type and sequence of variation as well as the genotypes of multiple samples for this variation.

+

That being said, this highly detailed information can be challenging to understand. The information provided by the GATK tools that infer variation from high-throughput sequencing data, such as the HaplotypeCaller, is especially complex. This document describes the key features and annotations that you need to know about in order to understand VCF files output by the GATK tools.

+

Note that VCF files are plain text files, so you can open them for viewing or editing in any text editor, with the following caveats:

+ +
+

2. Basic structure of a VCF file

+

A valid VCF file is composed of two main parts: the header, and the variant call records.

+

+

The header contains information about the dataset and relevant reference sources (e.g. the organism, genome build version etc.), as well as definitions of all the annotations used to qualify and quantify the properties of the variant calls contained in the VCF file. The headers of VCFs generated by GATK tools also include the command line that was used to generate them. Some other programs also record the command line in the VCF header, but not all do so, as it is not required by the VCF specification. For more information about the header, see the next section.

+

The actual data lines will look something like this:

+
[HEADER LINES]
+#CHROM  POS ID      REF ALT QUAL    FILTER  INFO          FORMAT          NA12878
+1   873762  .       T   G   5231.78 PASS    [ANNOTATIONS] GT:AD:DP:GQ:PL  0/1:173,141:282:99:255,0,255
+1   877664  rs3828047   A   G   3931.66 PASS    [ANNOTATIONS] GT:AD:DP:GQ:PL  1/1:0,105:94:99:255,255,0
+1   899282  rs28548431  C   T   71.77   PASS    [ANNOTATIONS] GT:AD:DP:GQ:PL  0/1:1,3:4:26:103,0,26
+1   974165  rs9442391   T   C   29.84   LowQual [ANNOTATIONS] GT:AD:DP:GQ:PL  0/1:14,4:14:61:61,0,255
+

After the header lines and the field names, each line represents a single variant, with various properties of that variant represented in the columns. Note that all the lines shown in the example above describe SNPs (also called SNVs), but other types of variation can be described, such as indels or CNVs. See the VCF specification for details on how the various types of variation are represented. Depending on how the callset was generated, there may only be records for sites where a variant was identified, or there may also be "invariant" records, i.e. records for sites where no variation was identified.

+

You will sometimes come across VCFs that have only 8 columns, and contain no FORMAT or sample-specific information. These are called "sites-only" VCFs, and represent variation that has been observed in a population. Generally, information about the population of origin should be included in the header.

+
+

3. Interpreting the VCF file header information

+

The following is a valid VCF header produced by HaplotypeCaller on an example data set (derived from our favorite test sample, NA12878). You can download similar test data from our resource bundle and try looking at it yourself!

+
##fileformat=VCFv4.1
+##FILTER=<ID=LowQual,Description="Low quality">
+##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
+##GATKCommandLine.HaplotypeCaller=<ID=HaplotypeCaller,Version=3.4-3-gd1ac142,Date="Mon May 18 17:36:4
+.
+.
+.
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##contig=<ID=chr1,length=249250621,assembly=b37>
+##reference=file:human_genome_b37.fasta
+

We're not showing all the lines here, but that's still a lot... so let's break it down into digestible bits. Note that the header lines are always listed in alphabetical order.

+ +

The first line:

+
##fileformat=VCFv4.1
+

tells you the version of the VCF specification to which the file conforms. This may seem uninteresting but it can have some important consequences for how to handle and interpret the file contents. As genomics is a fast moving field, the file formats are evolving fairly rapidly, so some of the encoding conventions change. If you run into unexpected issues while trying to parse a VCF file, be sure to check the version and the spec for any relevant format changes.

+ +

The FILTER lines tell you what filters have been applied to the data. In our test file, one filter has been applied:

+
##FILTER=<ID=LowQual,Description="Low quality">
+

Records that fail any of the filters listed here will contain the ID of the filter (here, LowQual) in its FILTER field (see how records are structured further below).

+ +

These lines define the annotations contained in the FORMAT and INFO columns of the VCF file, which we explain further below. If you ever need to know what an annotation stands for, you can always check the VCF header for a brief explanation.

+ +

The GATKCommandLine lines contain all the parameters that were used by the tool that generated the file. Here, GATKCommandLine.HaplotypeCaller refers to a command line invoking HaplotypeCaller. These parameters include all the arguments that the tool accepts, not just the ones specified explicitly by the user in the command line.

+ +

These lines contain the contig names and lengths, and indicate which reference assembly was used with the input BAM file. This can come in handy when someone gives you a callset but doesn't tell you which reference it was derived from -- remember that for most organisms, there are multiple reference assemblies, and you should always make sure to use the appropriate one!

+

[todo: FAQ on genome builds]

+
+

4. Structure of variant call records

+

For each site record, the information is structured into columns (also called fields) as follows:

+
#CHROM  POS ID  REF ALT     QUAL    FILTER  INFO    FORMAT  NA12878 [other samples...]
+

The first 8 columns of the VCF records (up to and including INFO) represent the properties observed at the level of the variant (or invariant) site. Keep in mind that when multiple samples are represented in a VCF file, some of the site-level annotations represent a summary or average of the values obtained for that site from the different samples.

+

Sample-specific information such as genotype and individual sample-level annotation values are contained in the FORMAT column (9th column) and in the sample-name columns (10th and beyond). In the example above, there is one sample called NA12878; if there were additional samples there would be additional columns to the right. Most programs order the sample columns alphabetically by sample name, but this is not always the case, so be aware that you can't depend on ordering rules for parsing VCF output!

+

Site-level properties and annotations

+

These first 7 fields are required by the VCF format and must be present, although they can be empty (in practice, there has to be a dot, i.e. ".", to serve as a placeholder).

+ +

This next field does not have to be present in the VCF.

+ +

Sample-level annotations

+

At this point you've met all the fields up to INFO in this lineup:

+
#CHROM  POS ID  REF ALT     QUAL    FILTER  INFO    FORMAT  NA12878 [other samples...]
+

All the rest is going to be sample-level information. Sample-level annotations are tag-value pairs, like the INFO annotations, but the formatting is a bit different. The short names of the sample-level annotations are recorded in the FORMAT field. The annotation values are then recorded in corresponding order in each sample column (where the sample names are the SM tags identified in the read group data). Typically, you will at minimum have information about the genotype and confidence in the genotype for the sample at each site. See the next section on genotypes for more details.

+
+

5. How the genotype and other sample-level information is represented

+

The sample-level information contained in the VCF (also called "genotype fields") may look a bit complicated at first glance, but they're actually not that hard to interpret once you understand that they're just sets of tags and values.

+

Let's take a look at three of the records shown earlier, simplified to just show the key genotype annotations:

+
1   873762  .       T   G   [CLIPPED] GT:AD:DP:GQ:PL    0/1:173,141:282:99:255,0,255
+1   877664  rs3828047   A   G   [CLIPPED] GT:AD:DP:GQ:PL    1/1:0,105:94:99:255,255,0
+1   899282  rs28548431  C   T   [CLIPPED] GT:AD:DP:GQ:PL    0/1:1,3:4:26:103,0,26
+

Looking at that last column, here is what the tags mean:

GT : the genotype of this sample at this site
AD : the allelic depths for the ref and alt alleles, in the order listed
DP : the approximate read depth at this site (reads with MQ=255 or with bad mates are filtered)
GQ : the genotype quality
PL : the normalized, Phred-scaled likelihoods of the possible genotypes

With that out of the way, let's interpret the genotype information for NA12878 at 1:899282.

+
1   899282  rs28548431  C   T   [CLIPPED] GT:AD:DP:GQ:PL    0/1:1,3:4:26:103,0,26
+

At this site, the called genotype is GT = 0/1, which corresponds to the alleles C/T. The confidence indicated by GQ = 26 isn't very good, largely because there were only a total of 4 reads at this site (DP = 4), 1 of which was REF (=had the reference base) and 3 of which were ALT (=had the alternate base) (indicated by AD=1,3). The lack of certainty is evident in the PL field, where PL(0/1) = 0 (the normalized value that corresponds to a likelihood of 1.0), as is always the case for the assigned genotype, but the next PL is PL(1/1) = 26 (which corresponds to 10^(-2.6), or 0.0025). So although we're pretty sure there's a variant at this site, there's a chance that the genotype assignment is incorrect, and that the subject may in fact not be het (heterozygous) but may instead be hom-var (homozygous with the variant allele). But either way, it's clear that the subject is definitely not hom-ref (homozygous with the reference allele) since PL(0/0) = 103, which corresponds to 10^(-10.3), a very small number.

+
+

6. How to extract information from a VCF in a sane, (mostly) straightforward way

+

Use VariantsToTable.

+

No, really, don't write your own parser if you can avoid it. This is not a comment on how smart or how competent we think you are -- it's a comment on how annoyingly obtuse and convoluted the VCF format is.
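
To give a flavor of what that looks like (a hedged sketch; which fields you extract depends entirely on your analysis), a VariantsToTable command might be:

java -jar GenomeAnalysisTK.jar \
    -T VariantsToTable \
    -R reference.fasta \
    -V my_callset.vcf \
    -F CHROM -F POS -F REF -F ALT -F QUAL \
    -GF GT -GF GQ \
    -o my_callset.table

The -F arguments pull site-level fields and the -GF arguments pull the corresponding genotype-level fields for each sample, producing a plain tab-delimited table that is trivial to load into R or a spreadsheet.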

+

Seriously. The VCF format lends itself really poorly to parsing methods like regular expressions, and we hear sob stories all the time from perfectly competent people whose home-brewed parser broke because it couldn't handle a more esoteric feature of the format. We know we broke a bunch of people's scripts when we introduced a new representation for spanning deletions in multisample callsets. OK, we ended up replacing it with a better representation a month later that was a lot less disruptive and more in line with the spirit of the specification -- but the point is, that first version was technically legal by the 4.2 spec, and that sort of thing can happen at any time. So yes, the VCF is a difficult format to work with, and one way to deal with that safely is to not home-brew parsers.

+

(Why are we sticking with it anyway? Because, as Winston Churchill famously put it, VCF is the worst variant call representation, except for all the others.)

\ No newline at end of file diff --git a/doc_archive/faqs/What_is_the_GATKReport_file_format?.md b/doc_archive/faqs/What_is_the_GATKReport_file_format?.md new file mode 100644 index 000000000..d2705e956 --- /dev/null +++ b/doc_archive/faqs/What_is_the_GATKReport_file_format?.md @@ -0,0 +1,63 @@ +## What is the GATKReport file format? + +http://gatkforums.broadinstitute.org/gatk/discussion/1244/what-is-the-gatkreport-file-format + +

A GATKReport is simply a text document that contains a well-formatted, easy-to-read representation of some tabular data. Many GATK tools output their results as GATKReports, so it's important to understand how they are formatted and how you can use them in further analyses.

+

Here's a simple example:

+
#:GATKReport.v1.0:2
+#:GATKTable:true:2:9:%.18E:%.15f:;
+#:GATKTable:ErrorRatePerCycle:The error rate per sequenced position in the reads
+cycle  errorrate.61PA8.7         qualavg.61PA8.7                                         
+0      7.451835696110506E-3      25.474613284804366                                      
+1      2.362777171937477E-3      29.844949954504095                                      
+2      9.087604507451836E-4      32.875909752547310
+3      5.452562704471102E-4      34.498999090081895                                      
+4      9.087604507451836E-4      35.148316651501370                                       
+5      5.452562704471102E-4      36.072234352256190                                       
+6      5.452562704471102E-4      36.121724890829700                                        
+7      5.452562704471102E-4      36.191048034934500                                        
+8      5.452562704471102E-4      36.003457059679770                                       
+
+#:GATKTable:false:2:3:%s:%c:;
+#:GATKTable:TableName:Description
+key    column
+1:1000  T 
+1:1001  A 
+1:1002  C 
+

This report contains two individual GATK report tables. Every table begins with a header for its metadata and then a header for its name and description. The next row contains the column names followed by the data.

+

We provide an R library called gsalib that allows you to load GATKReport files into R for further analysis. Here are four simple steps to getting gsalib, installing it and loading a report.

+

1. Start R (or open RStudio)

+
$ R
+
+R version 2.11.0 (2010-04-22)
+Copyright (C) 2010 The R Foundation for Statistical Computing
+ISBN 3-900051-07-0
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+  Natural language support but running in an English locale
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+

2. Get the gsalib library from CRAN

+

The gsalib library is available on the Comprehensive R Archive Network, so you can just do:

+
> install.packages("gsalib") 
+

Run this command from within R (we use RStudio for convenience).

+

In some cases you need to explicitly tell R where to find the library; you can do this as follows:

+
$ cat .Rprofile 
+.libPaths("/path/to/Sting/R/")
+

3. Load the gsalib library

+
> library(gsalib)
+

4. Finally, load the GATKReport file and have fun

+
> d = gsa.read.gatkreport("/path/to/my.gatkreport")
+> summary(d)
+              Length Class      Mode
+CountVariants 27     data.frame list
+CompOverlap   13     data.frame list
\ No newline at end of file diff --git a/doc_archive/faqs/What_is_the_difference_between_QUAL_and_GQ_annotations?.md b/doc_archive/faqs/What_is_the_difference_between_QUAL_and_GQ_annotations?.md new file mode 100644 index 000000000..8b0112bd3 --- /dev/null +++ b/doc_archive/faqs/What_is_the_difference_between_QUAL_and_GQ_annotations?.md @@ -0,0 +1,16 @@ +## What is the difference between QUAL and GQ annotations? + +http://gatkforums.broadinstitute.org/gatk/discussion/4860/what-is-the-difference-between-qual-and-gq-annotations + +

There has been a lot of confusion about the difference between QUAL and GQ, and we hope this FAQ will clarify the difference.

+

The basic difference is that QUAL refers to the variant site whereas GQ refers to a specific sample's GT.

+ +

QUAL (or more importantly, its normalized form, QD) is mostly useful in a multisample context. When you are recalibrating a cohort callset, you're going to be looking exclusively at site-level annotations like QD, because at that point what you're looking for is evidence of variation overall. That way you don't rely too much on individual sample calls, which are less robust.

+

In fact, many cohort studies don't even really care about individual genotype assignments, so they only use site annotations for their entire analysis.

+

Conversely, QUAL may seem redundant if you have only one sample. If that sample has a good GQ (and more importantly, well-separated PLs), then admittedly you don't really need to look at the QUAL -- you know what you have. If the GQ is not good, you can typically rely on the PLs to tell you whether there probably is a variant, even if it's unclear whether it is het or hom-var. If hom-ref is also a possibility, the call may be a potential false positive.

+

That said, it is more effective to filter on site-level annotations first, then refine and filter genotypes as appropriate. That's the workflow we recommend, based on years of experience doing this at fairly large scales...

\ No newline at end of file diff --git a/doc_archive/faqs/What_is_the_structure_of_a_GATK_command?.md b/doc_archive/faqs/What_is_the_structure_of_a_GATK_command?.md new file mode 100644 index 000000000..4fc8aa59f --- /dev/null +++ b/doc_archive/faqs/What_is_the_structure_of_a_GATK_command?.md @@ -0,0 +1,35 @@ +## What is the structure of a GATK command? + +http://gatkforums.broadinstitute.org/gatk/discussion/4669/what-is-the-structure-of-a-gatk-command + +

Overview

+

This document describes how GATK commands are structured and how to add arguments to basic command examples.

+
+

Basic java syntax

+

Commands for GATK always follow the same basic syntax:

+
java [Java arguments] -jar GenomeAnalysisTK.jar [GATK arguments]
+

The core of the command is java -jar GenomeAnalysisTK.jar, which starts up the GATK program in a Java Virtual Machine (JVM). Any additional java-specific arguments (such as -Xmx to increase memory allocation) should be inserted between java and -jar, like this:

+
java -Xmx4G -jar GenomeAnalysisTK.jar [GATK arguments]
+

The order of arguments between java and -jar is not important.

+
+

GATK arguments

+

There are two universal arguments that are required for every GATK command (with very few exceptions, such as the CLP-type utilities): -R for Reference (e.g. -R human_b37.fasta) and -T for Tool name (e.g. -T HaplotypeCaller).

+

Additional arguments fall in two categories:

  - Engine arguments, which are shared by all tools and control core behaviors of the GATK engine (e.g. -L to restrict the analysis to given intervals)
  - Tool-specific arguments, which only apply to the particular tool you are running

The ordering of GATK arguments is not important, but we recommend always passing the tool name (-T) and reference (-R) first for consistency. It is also a good idea to consistently order arguments by some kind of logic in order to make it easy to compare different commands over the course of a project. It’s up to you to choose what that logic should be.

+

All available engine and tool-specific arguments are listed in the tool documentation section. Arguments typically have both a long name (prefixed by --) and a short name (prefixed by -). The GATK command line parser recognizes both equally, so you can use whichever you prefer, depending on whether you prefer commands to be more verbose or more succinct.

+

Finally, a note about flags. Flags are arguments that have boolean values, i.e. TRUE or FALSE. They are typically used to enable or disable specific features; for example, --keep_program_records will make certain GATK tools output additional information in the BAM header that would be omitted otherwise. In GATK, all flags are set to FALSE by default, so if you want to set one to TRUE, all you need to do is add the flag name to the command. You don't need to specify an actual value.
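
For example (a hedged illustration reusing the flag mentioned above; file names are placeholders), enabling a flag is just a matter of appending its name to the command:

java -Xmx4G -jar GenomeAnalysisTK.jar -R human_b37.fasta -T PrintReads -I sample1.bam -o output.bam --keep_program_records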

+
+

Examples of complete GATK command lines

+

This is a very simple command that runs HaplotypeCaller in default mode on a single input BAM file containing sequence data and outputs a VCF file containing raw variants.

+
java -Xmx4G -jar GenomeAnalysisTK.jar -R human_b37.fasta -T HaplotypeCaller -I sample1.bam -o raw_variants.vcf
+

If the data is from exome sequencing, we should additionally provide the exome targets using the -L argument:

+
java -Xmx4G -jar GenomeAnalysisTK.jar -R human_b37.fasta -T HaplotypeCaller -I sample1.bam -o raw_variants.vcf -L exome_intervals.list
+

If we just want to genotype specific sites of interest using known alleles based on results from a previous study, we can change the HaplotypeCaller’s genotyping mode using -gt_mode, provide those alleles using -alleles, and restrict the analysis to just those sites using -L:

+
java -Xmx4G -jar GenomeAnalysisTK.jar -R human_b37.fasta -T HaplotypeCaller -I sample1.bam -o raw_variants.vcf -L known_alleles.vcf -alleles known_alleles.vcf -gt_mode GENOTYPE_GIVEN_ALLELES
+

For more examples of commands and for specific tool commands, see the tool documentation section.

\ No newline at end of file diff --git a/doc_archive/faqs/What_is_uBAM_and_why_is_it_better_than_FASTQ_for_storing_unmapped_sequence_data?.md b/doc_archive/faqs/What_is_uBAM_and_why_is_it_better_than_FASTQ_for_storing_unmapped_sequence_data?.md new file mode 100644 index 000000000..2c88dbd2c --- /dev/null +++ b/doc_archive/faqs/What_is_uBAM_and_why_is_it_better_than_FASTQ_for_storing_unmapped_sequence_data?.md @@ -0,0 +1,7 @@ +## What is uBAM and why is it better than FASTQ for storing unmapped sequence data? + +http://gatkforums.broadinstitute.org/gatk/discussion/5990/what-is-ubam-and-why-is-it-better-than-fastq-for-storing-unmapped-sequence-data + +

Most sequencing providers generate FASTQ files with the raw unmapped read sequences, so that is the most common form in which data enters the mapping step of the pre-processing pipeline. This is not ideal because, among other flaws, much of the metadata associated with sequencing runs cannot be stored in FASTQ files, whereas BAM files can carry that information. See this blog post for an overview of the many problems associated with the FASTQ format.
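
For illustration, below is a minimal sketch of converting a pair of FASTQ files into an unmapped BAM with Picard's FastqToSam, attaching run metadata that FASTQ itself cannot hold. The file names and read group values are hypothetical, and the single-jar picard.jar invocation assumes a recent Picard release.

java -jar picard.jar FastqToSam \
    FASTQ=sample1_R1.fastq \
    FASTQ2=sample1_R2.fastq \
    OUTPUT=sample1_unmapped.bam \
    READ_GROUP_NAME=run1.lane1 \
    SAMPLE_NAME=sample1 \
    LIBRARY_NAME=lib1 \
    PLATFORM=illumina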

+

At the Broad Institute, we generate unmapped BAM (uBAM) files directly from the Illumina basecalls in order to keep all metadata in one place, and we do not write the data to FASTQ files at any point. This involves a slightly more complex workflow than is shown in the general Best Practices diagram. See this presentation for more details of how this works.

+

In case you're wondering, we still show the FASTQ-based workflow as the default in most of our documentation because it is by far the most commonly-used workflow, and we want to keep the documentation accessible for our more novice users.

\ No newline at end of file diff --git a/doc_archive/faqs/What_should_I_use_as_known_variants_sites_for_running_tool_X?.md b/doc_archive/faqs/What_should_I_use_as_known_variants_sites_for_running_tool_X?.md new file mode 100644 index 000000000..254c51736 --- /dev/null +++ b/doc_archive/faqs/What_should_I_use_as_known_variants_sites_for_running_tool_X?.md @@ -0,0 +1,110 @@ +## What should I use as known variants/sites for running tool X? + +http://gatkforums.broadinstitute.org/gatk/discussion/1247/what-should-i-use-as-known-variants-sites-for-running-tool-x + +

1. Notes on known sites

+

Why are they important?

+

Each tool uses known sites differently, but what is common to all is that they use them to help distinguish true variants from false positives, which is very important to how these tools work. If you don't provide known sites, the statistical analysis of the data will be skewed, which can dramatically affect the sensitivity and reliability of the results.

+

In the variant calling pipeline, the only tools that do not strictly require known sites are UnifiedGenotyper and HaplotypeCaller.

+

Human genomes

+

If you're working on human genomes, you're in luck. We provide sets of known sites in the human genome as part of our resource bundle, and we can give you specific Best Practices recommendations on which sets to use for each tool in the variant calling pipeline. See the next section for details.

+

Non-human genomes

+

If you're working on genomes of other organisms, things may be a little harder -- but don't panic, we'll try to help as much as we can. We've started a community discussion in the forum on What are the standard resources for non-human genomes? in which we hope people with non-human genomics experience will share their knowledge.

+

And if it turns out that there is as yet no suitable set of known sites for your organisms, here's how to make your own for the purposes of BaseRecalibration: First, do an initial round of SNP calling on your original, unrecalibrated data. Then take the SNPs that you have the highest confidence in and use that set as the database of known SNPs by feeding it as a VCF file to the base quality score recalibrator. Finally, do a real round of SNP calling with the recalibrated data. These steps could be repeated several times until convergence. Good luck!

+

Some experimentation will be required to figure out the best way to find the highest confidence SNPs for use here. Perhaps one could call variants with several different calling algorithms and take the set intersection. Or perhaps one could do a very strict round of filtering and take only those variants which pass the test.
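
As one illustration of the strict-filtering idea, here is a minimal sketch that keeps only the SNPs passing a hard filter for use as a makeshift known-sites file. The file names and thresholds are hypothetical and would need tuning for your own data.

java -jar GenomeAnalysisTK.jar -T VariantFiltration -R my_reference.fasta \
    -V initial_calls.vcf \
    --filterExpression "QD < 10.0 || FS > 10.0" --filterName "lowConfidence" \
    -o initial_calls.flagged.vcf

java -jar GenomeAnalysisTK.jar -T SelectVariants -R my_reference.fasta \
    -V initial_calls.flagged.vcf -selectType SNP --excludeFiltered \
    -o high_confidence_snps.vcf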

+

2. Recommended sets of known sites per tool

+

Summary table

| Tool | dbSNP 129 | dbSNP >132 | Mills indels | 1KG indels | HapMap | Omni |
| --- | --- | --- | --- | --- | --- | --- |
| RealignerTargetCreator | | | X | X | | |
| IndelRealigner | | | X | X | | |
| BaseRecalibrator | | X | X | X | | |
| (UnifiedGenotyper/ HaplotypeCaller) | | X | | | | |
| VariantRecalibrator | | X | X | | X | X |
| VariantEval | X | | | | | |
+

RealignerTargetCreator and IndelRealigner

+

These tools require known indels passed with the -known argument to function properly. We use both the following files:

+ +

BaseRecalibrator

+

This tool requires known SNPs and indels passed with the -knownSites argument to function properly. We use all the following files:

+ +

UnifiedGenotyper / HaplotypeCaller

+

These tools do NOT require known sites, but if SNPs are provided with the -dbsnp argument they will use them for variant annotation. We use this file:

+ +

VariantRecalibrator

+

For VariantRecalibrator, please see the FAQ article on VQSR training sets and arguments.

+

VariantEval

+

This tool requires known SNPs passed with the -dbsnp argument to function properly. We use the following file:

+ \ No newline at end of file diff --git a/doc_archive/faqs/What_types_of_variants_can_GATK_tools_detect___handle?.md b/doc_archive/faqs/What_types_of_variants_can_GATK_tools_detect___handle?.md new file mode 100644 index 000000000..08ec9ff55 --- /dev/null +++ b/doc_archive/faqs/What_types_of_variants_can_GATK_tools_detect___handle?.md @@ -0,0 +1,19 @@ +## What types of variants can GATK tools detect / handle? + +http://gatkforums.broadinstitute.org/gatk/discussion/3682/what-types-of-variants-can-gatk-tools-detect-handle + +

The answer depends on what tool we're talking about, and whether we're considering variant discovery or variant manipulation.

+

Variant manipulation

+

GATK variant manipulation tools are able to recognize the following types of alleles:

+ +

Note that SelectVariants, the GATK tool most used for VCF subsetting operations, discriminates strictly between these categories. This means that if you use, for example, -selectType INDEL to pull out indels, it will only select pure INDEL records, excluding any MIXED records that might include a SNP allele in addition to the insertion or deletion alleles of interest. To include those you would also have to specify -selectType MIXED in the same command.
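
For example, a minimal sketch (hypothetical file names) that pulls out both pure INDEL records and MIXED records in a single pass:

java -jar GenomeAnalysisTK.jar -T SelectVariants -R human_b37.fasta -V input.vcf -selectType INDEL -selectType MIXED -o indels_and_mixed.vcf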

+

Variant discovery

+

The HaplotypeCaller is a sophisticated variant caller that can call different types of variants at the same time. So in addition to SNPs and indels, it is capable of emitting mixed records by default, as well as symbolic representations for e.g. spanning deletions. It does emit physical phasing information, but in its current version, HC is not able to emit MNPs. If you would like to combine contiguous SNPs into MNPs, you will need to use the ReadBackedPhasing tool with the MNP merging function activated. See the tool documentation for details.

Our older (and now deprecated) variant caller, UnifiedGenotyper, was even more limited. It only called SNPs and indels, and did so separately (even if you ran in calling mode BOTH, the program performed separate calling operations internally), so it was not able to recognize that SNPs and indels should be emitted together as a joint record when they occur at the same site.

+

The general release version of GATK is currently not able to detect SVs (structural variations) or CNVs (copy number variations). However, the alpha version of GATK 4 (the next generation of GATK tools) includes tools for performing CNV (copy number variation) analysis in exome data. Let us know if you're interested in trying them out by commenting on this article in the forum.

+

There is also a third-party software package called GenomeSTRiP built on top of GATK that provides SV (structural variation) analysis capabilities.

\ No newline at end of file diff --git a/doc_archive/faqs/When_should_I_use_-L_to_pass_in_a_list_of_intervals?.md b/doc_archive/faqs/When_should_I_use_-L_to_pass_in_a_list_of_intervals?.md new file mode 100644 index 000000000..cfaf47176 --- /dev/null +++ b/doc_archive/faqs/When_should_I_use_-L_to_pass_in_a_list_of_intervals?.md @@ -0,0 +1,74 @@ +## When should I use -L to pass in a list of intervals? + +http://gatkforums.broadinstitute.org/gatk/discussion/4133/when-should-i-use-l-to-pass-in-a-list-of-intervals + +

The -L argument (short for --intervals) enables you to restrict your analysis to specific intervals instead of running over the whole genome. Using this argument can have important consequences for performance and/or results. Here, we present some guidelines for using it appropriately depending on your experimental design.

+

In a nutshell, if you’re doing:

+

- Whole genome analysis: no need to include intervals
+- Whole exome analysis: you need to provide the list of capture targets (typically genes/exons)
+- Small targeted experiment: you need to provide the targeted interval(s)
+- Troubleshooting: you can run on a specific interval to test parameters or create a data snippet

+

Important notes:

+

Whatever you end up using -L for, keep this in mind: for tools that output a bam or VCF file, the output file will only contain data from the intervals specified by the -L argument. To be clear, we do not recommend using -L with tools that output a bam file since doing so will omit some data from the output.

+

Example Use of -L:

+

-L 20 (for chromosome 20 in the b36/b37 builds)

+

-L chr20:1-100 (for chromosome 20 positions 1-100 in hg18/hg19 build)

+

Specifying contigs with colons in their names, as occurs for new contigs in GRCh38, requires special handling for GATK versions prior to v3.6. Please use the following workaround.

+

- For example, HLA-A*01:01:01:01 is a new contig in GRCh38. Colons in contig names are a new feature of GRCh38 relative to prior assemblies. This has implications for using the -L option of GATK, as the option also uses the colon as a delimiter to distinguish between contig and genomic coordinates.
+- When defining coordinates of interest for a contig, e.g. positions 1-100 for chr1, we would use -L chr1:1-100. This also works for our HLA contig, e.g. -L HLA-A*01:01:01:01:1-100.
+- However, when passing in an entire contig, for contigs with colons in the name, you must add :1+ to the end of the contig name, as shown below. This ensures that all portions of the contig name are identified as part of the contig name and not as genomic coordinates.

+
-L HLA-A*01:01:01:01:1+
+
+

So here’s a little more detail for each experimental design type.

+

Whole genome analysis

+

It is not necessary to use -L in whole genome analysis. You should be interested in the whole genome!

+

Nevertheless, in some cases, you may want to mask out certain contigs (e.g. chrY or non-chromosome contigs) or regions (e.g. centromere). You can do this with -XL, which does the exact opposite of -L; it excludes the provided intervals.
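
For instance, a minimal sketch of a whole-genome HaplotypeCaller run that excludes chromosome Y and a hypothetical centromere interval list:

java -Xmx4G -jar GenomeAnalysisTK.jar -R human_b37.fasta -T HaplotypeCaller -I sample1.bam -o raw_variants.vcf -XL Y -XL centromeres.interval_list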

+

Whole exome analysis

+

By definition, exome sequencing data doesn’t cover the entire genome, so many analyses can be restricted to just the capture targets (genes or exons) to save processing time. There are even some analyses which should be restricted to the capture targets because failing to do so can lead to suboptimal results.

+

Note that we recommend adding some “padding” to the intervals in order to include the flanking regions (typically ~100 bp). No need to modify your target list; you can have the GATK engine do it for you automatically using the interval padding argument. This is not required, but if you do use it, you should do it consistently at all steps where you use -L.
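
As a minimal sketch, an exome HaplotypeCaller command with a hypothetical capture target list and 100 bp of padding (-ip is the short form of --interval_padding) might look like this:

java -Xmx4G -jar GenomeAnalysisTK.jar -R human_b37.fasta -T HaplotypeCaller -I sample1.bam -o raw_variants.vcf -L exome_targets.interval_list -ip 100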

+

Below is a step-by-step breakdown of the Best Practices workflow, with a detailed explanation of why -L should or shouldn’t be used with each tool.

| Tool | -L? | Why / why not |
| --- | --- | --- |
| RealignerTargetCreator | YES | Faster since RTC will only look for regions that need to be realigned within the input interval; no time wasted on the rest. |
| IndelRealigner | NO | IR will only try to realign the regions output from RealignerTargetCreator, so there is nothing to be gained by providing the capture targets. |
| BaseRecalibrator | YES | This excludes off-target sequences and sequences that may be poorly mapped, which have a higher error rate. Including them could lead to a skewed model and bad recalibration. |
| PrintReads | NO | Output is a bam file; using -L would lead to lost data. |
| UnifiedGenotyper/HaplotypeCaller | YES | We’re only interested in making calls in exome regions; the rest is a waste of time & includes lots of false positives. |
| Next steps | NO | No need since subsequent steps operate on the callset, which was restricted to the exome at the calling step. |
+

Small targeted experiments

+

The same guidelines as for whole exome analysis apply except you do not run BQSR on small datasets.

+

Debugging / troubleshooting

+

You can go crazy with -L while troubleshooting! For example, you can just provide an interval at the command line, and the output file will contain the data from that interval. This is really useful when you’re trying to figure out what’s going on in a specific interval (e.g. why HaplotypeCaller is not calling your favorite indel) or what would be the effect of changing a parameter (e.g. what happens to your indel call if you increase the value of -minPruning). This is also what you’d use to generate a file snippet to send us as part of a bug report (except that never happens because GATK has no bugs, ever).

\ No newline at end of file diff --git a/doc_archive/faqs/Where_can_I_get_a_gene_list_in_RefSeq_format?.md b/doc_archive/faqs/Where_can_I_get_a_gene_list_in_RefSeq_format?.md new file mode 100644 index 000000000..e9a4827fc --- /dev/null +++ b/doc_archive/faqs/Where_can_I_get_a_gene_list_in_RefSeq_format?.md @@ -0,0 +1,32 @@ +## Where can I get a gene list in RefSeq format? + +http://gatkforums.broadinstitute.org/gatk/discussion/1329/where-can-i-get-a-gene-list-in-refseq-format + +

1. About the RefSeq Format

+

From the NCBI RefSeq website

+
+

The Reference Sequence (RefSeq) collection aims to provide a comprehensive, integrated, non-redundant, well-annotated set of sequences, including genomic DNA, transcripts, and proteins. RefSeq is a foundation for medical, functional, and diversity studies; they provide a stable reference for genome annotation, gene identification and characterization, mutation and polymorphism analysis (especially RefSeqGene records), expression studies, and comparative analyses.

+
+

2. In the GATK

+

The GATK uses RefSeq in a variety of walkers, from indel calling to variant annotations. There are many file format flavors of RefSeq; we've chosen to use the table dump available from the UCSC genome table browser.

+

3. Generating RefSeq files

+

Go to the UCSC genome table browser. There are many output options, here are the changes that you'll need to make:

+
clade:    Mammal
+genome:   Human
+assembly: ''choose the appropriate assembly for the reference you're using''
+group:    Genes and Gene Prediction Tracks
+track:    RefSeq Genes
+table:    refGene
+region:   ''choose the genome option''
+

Choose a good output filename, something like geneTrack.refSeq, and click the get output button. You now have your initial RefSeq file, which will not be sorted, and will contain non-standard contigs. To run with the GATK, contigs other than the standard 1-22,X,Y,MT must be removed, and the file sorted in karyotypic order.
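
There is more than one way to do that cleanup. Below is a minimal shell sketch, assuming hg19-style contig names (chr1-chr22, chrX, chrY, chrM) and the standard refGene column layout in which the contig name is column 3 and the transcript start is column 5; check the column layout of your own download before relying on it.

# keep only the standard contigs (column 3 holds the contig name)
awk '$3 ~ /^chr([1-9]|1[0-9]|2[0-2]|X|Y|M)$/' geneTrack.refSeq > geneTrack.standard.refSeq

# sort karyotypically: prepend a numeric rank per contig, sort by rank then start position, then drop the rank
awk 'BEGIN{OFS="\t"} {c=$3; sub(/^chr/,"",c);
     if (c=="X") r=23; else if (c=="Y") r=24; else if (c=="M") r=25; else r=c+0;
     print r, $0}' geneTrack.standard.refSeq \
  | sort -k1,1n -k6,6n \
  | cut -f 2- > geneTrack.sorted.refSeq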

+

4. Running with the GATK

+

You can provide your RefSeq file to the GATK like you would for any other ROD command line argument. The line would look like the following:

+
-[arg]:REFSEQ /path/to/refSeq
+

Using the filename from above.

+

Warning:

+

The GATK automatically adjusts the start and stop position of the records from zero-based half-open intervals (UCSC standard) to one-based closed intervals.

+

For example:

+
The first 19 bases in Chromosome one:
+Chr1:0-19 (UCSC system)
+Chr1:1-19 (GATK)
+

All of the GATK output is also in this format, so if you're using other tools or scripts to process RefSeq or GATK output files, you should be aware of this difference.

\ No newline at end of file diff --git a/doc_archive/faqs/Where_can_I_get_the_GATK_source_code?.md b/doc_archive/faqs/Where_can_I_get_the_GATK_source_code?.md new file mode 100644 index 000000000..a5edbf7fc --- /dev/null +++ b/doc_archive/faqs/Where_can_I_get_the_GATK_source_code?.md @@ -0,0 +1,22 @@ +## Where can I get the GATK source code? + +http://gatkforums.broadinstitute.org/gatk/discussion/4022/where-can-i-get-the-gatk-source-code + +

We distinguish "Classic GATK" (major versions 1 through 3) and GATK 4, the next generation of GATK tools.

+
+

"Classic GATK" (major versions 1 through 3) (current distribution)

+

We provide the current GATK source code through two publicly accessible Github repositories: broadgsa/gatk and broadgsa/gatk-protected.

+

1. broadgsa/gatk

+

This repository contains the code corresponding to the core GATK development framework, including the GATK engine and many utilities, which third-party developers can use to develop their own GATK-based analysis tools. Be advised however that support for development using this framework is being discontinued.

+

All the code in this repository is open-source under the MIT license. The full text of the license can be viewed here.

+

2. broadgsa/gatk-protected

+

This repository contains the code corresponding to the GenomeAnalysisTK.jar file that we distribute to our users, containing the GATK engine and all analysis tools.

+

This includes the code in broadgsa/gatk under the MIT license, plus tools and utilities that are under a more restrictive license that prohibits commercial/for-profit use. Anyone interested in accessing the protected code for commercial/for-profit purposes should contact our licensing department (softwarelicensing@broadinstitute.org) to inquire about licensing terms.

+
+

GATK 4+

+

The code for GATK 4+, currently available as an alpha preview, is accessible through two publicly accessible Github repositories: broadinstitute/gatk and broadinstitute/gatk-protected. The division is also based on having two different licenses, like Classic GATK, but in this case the repositories are complementary; there is no code shared between them.

+

1. broadinstitute/gatk

+

This repository contains the code corresponding to the core GATK 4+ development framework, including the new GATK engine and many utilities, which third-party developers can use to develop their own GATK-based analysis tools. We encourage developers to use this new framework for development and we welcome feedback regarding features and development support.

+

All the code in this repository is open-source under a BSD license. The full text of the license can be viewed here.

+

2. broadinstitute/gatk-protected

+

This repository contains the code for key analysis tools that are covered under a more restrictive license that prohibits commercial/for-profit use. Anyone interested in accessing the protected code for commercial/for-profit purposes should contact our licensing department (softwarelicensing@broadinstitute.org) to inquire about licensing terms.

\ No newline at end of file diff --git a/doc_archive/faqs/Which_datasets_should_I_use_for_reviewing_or_benchmarking_purposes?.md b/doc_archive/faqs/Which_datasets_should_I_use_for_reviewing_or_benchmarking_purposes?.md new file mode 100644 index 000000000..11a848bf7 --- /dev/null +++ b/doc_archive/faqs/Which_datasets_should_I_use_for_reviewing_or_benchmarking_purposes?.md @@ -0,0 +1,43 @@ +## Which datasets should I use for reviewing or benchmarking purposes? + +http://gatkforums.broadinstitute.org/gatk/discussion/1292/which-datasets-should-i-use-for-reviewing-or-benchmarking-purposes + +

New WGS and WEx CEU trio BAM files

+

We have sequenced at the Broad Institute and released to the 1000 Genomes Project the following datasets for the three members of the CEU trio (NA12878, NA12891 and NA12892):

+ +

This is better data to work with than the original DePristo et al. BAM files, so we recommend you download and analyze these files if you are looking for complete, large-scale data sets to evaluate the GATK or other tools.

+

Here's the rough library properties of the BAMs:

+

CEU trio BAM libraries

+

These data files can be downloaded from the 1000 Genomes DCC

+

NA12878 Datasets from DePristo et al. (2011) Nature Genetics

+

Here are the datasets we used in the GATK paper cited below.

+

DePristo M, Banks E, Poplin R, Garimella K, Maguire J, Hartl C, Philippakis A, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell T, Kernytsky A, Sivachenko A, Cibulskis K, Gabriel S, Altshuler D and Daly, M (2011). A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nature Genetics. 43:491-498.

+

Some of the BAM and VCF files are currently hosted by the NCBI: +ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/working/20101201_cg_NA12878/

+ +

Please note that we have not collected the indel calls for the paper, as these are only used for filtering SNPs near indels. If you want to call accurate indels, please use the new GATK indel caller in the Unified Genotyper.

+

Warnings

+

Both the GATK and the sequencing technologies have improved significantly since the analyses performed in this paper.

+ +

Obviously, this was an annoyance for us as well, as it would have been nice to use a state-of-the-art data set for the WEx. But we decided to freeze the data used for analysis to actually finish this paper.

+

How do I get the raw FASTQ file from a BAM?

+

If you want the raw, machine output for the data analyzed in the GATK framework paper, obtain the raw BAM files above and convert them from SAM to FASTQ using the Picard tool SamToFastq.
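
For reference, a minimal sketch of that conversion with Picard (the file names are hypothetical, and the single-jar picard.jar invocation assumes a recent Picard release):

java -jar picard.jar SamToFastq \
    INPUT=NA12878.bam \
    FASTQ=NA12878_R1.fastq \
    SECOND_END_FASTQ=NA12878_R2.fastq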

\ No newline at end of file diff --git a/doc_archive/faqs/Which_tools_use_pedigree_information?.md b/doc_archive/faqs/Which_tools_use_pedigree_information?.md new file mode 100644 index 000000000..7a371873c --- /dev/null +++ b/doc_archive/faqs/Which_tools_use_pedigree_information?.md @@ -0,0 +1,13 @@ +## Which tools use pedigree information? + +http://gatkforums.broadinstitute.org/gatk/discussion/37/which-tools-use-pedigree-information + +

There are two types of GATK tools that are able to use pedigree (family structure) information:

+

Tools that require a pedigree to operate

+

PhaseByTransmission and CalculateGenotypePosterior will not run without a properly formatted pedigree file. These tools are part of the Genotype Refinement workflow, which is documented here.

+

Tools that are able to generate standard variant annotations

+

The two variant callers (HaplotypeCaller and the deprecated UnifiedGenotyper) as well as VariantAnnotator and GenotypeGVCFs are all able to use pedigree information if you request an annotation that involves population structure (e.g. Inbreeding Coefficient). To be clear though, the pedigree information is not used during the variant calling process; it is only used during the annotation step at the end.

+

If you already have VCF files that were called without pedigree information, and you want to add pedigree-related annotations (e.g to use Variant Quality Score Recalibration (VQSR) with the InbreedingCoefficient as a feature annotation), don't panic. Just run the latest version of the VariantAnnotator to re-annotate your variants, requesting any missing annotations, and make sure you pass your PED file to the VariantAnnotator as well. If you forget to provide the pedigree file, the tool will run successfully but pedigree-related annotations may not be generated (this behavior is different in some older versions).
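
As a minimal sketch (hypothetical file names), such a re-annotation run might look like this:

java -jar GenomeAnalysisTK.jar -T VariantAnnotator \
    -R human_b37.fasta \
    -V my_calls.vcf \
    -ped my_trio.ped \
    -A InbreedingCoeff \
    -o my_calls.annotated.vcf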

+

About the PED format

+

The PED files used as input for these tools are based on PLINK pedigree files. The general description can be found here.

+

For these tools, the PED files must contain only the first 6 columns from the PLINK format PED file, and no alleles, like a FAM file in PLINK.

\ No newline at end of file diff --git a/doc_archive/faqs/Which_training_sets___arguments_should_I_use_for_running_VQSR?.md b/doc_archive/faqs/Which_training_sets___arguments_should_I_use_for_running_VQSR?.md new file mode 100644 index 000000000..b406088ae --- /dev/null +++ b/doc_archive/faqs/Which_training_sets___arguments_should_I_use_for_running_VQSR?.md @@ -0,0 +1,136 @@ +## Which training sets / arguments should I use for running VQSR? + +http://gatkforums.broadinstitute.org/gatk/discussion/1259/which-training-sets-arguments-should-i-use-for-running-vqsr + +

This document describes the resource datasets and arguments that we recommend for use in the two steps of VQSR (i.e. the successive application of VariantRecalibrator and ApplyRecalibration), based on our work with human genomes, to comply with the GATK Best Practices. The recommendations detailed in this document take precedence over any others you may see elsewhere in our documentation (e.g. in Tutorial articles, which are only meant to illustrate usage, or in past presentations, which may be out of date).

+

The document covers:

+ +

These recommendations are valid for use with calls generated by both the UnifiedGenotyper and HaplotypeCaller. In the past we made a distinction in how we processed the calls from these two callers, but now we treat them the same way. These recommendations will probably not work properly on calls generated by other (non-GATK) callers.

+

Note that VQSR must be run twice in succession in order to build a separate error model for SNPs and INDELs (see the VQSR documentation for more details).

+
+

Explanation of resource datasets

+

The human genome training, truth and known resource datasets mentioned in this document are all available from our resource bundle.

+

If you are working with non-human genomes, you will need to find or generate at least truth and training resource datasets with properties corresponding to those described below. To generate your own resource set, one idea is to first do an initial round of SNP calling and only use those SNPs which have the highest quality scores. These sites which have the most confidence are probably real and could be used as truth data to help disambiguate the rest of the variants in the call set. Another idea is to try using several SNP callers in addition to the UnifiedGenotyper or HaplotypeCaller, and use those sites which are concordant between the different methods as truth data. In either case, you'll need to assign your set a prior likelihood that reflects your confidence in how reliable it is as a truth set. We recommend Q10 as a starting value, which you can then experiment with to find the most appropriate value empirically. There are many possible avenues of research here. Hopefully the model reporting plots that are generated by the recalibration tools will help facilitate this experimentation.

+

Resources for SNPs

+ +

Resources for Indels

+ +
+

Important notes about annotations

+

Some of the annotations included in the recommendations given below might not be the best for your particular dataset. In particular, the following caveats apply:

+ +
+

Important notes for exome capture experiments

+

In our testing we've found that in order to achieve the best exome results one needs to use an exome SNP and/or indel callset with at least 30 samples. For users with experiments containing fewer exome samples there are several options to explore:

+ +
+

Argument recommendations for VariantRecalibrator

+

The variant quality score recalibrator builds an adaptive error model using known variant sites and then applies this model to estimate the probability that each variant is a true genetic variant or a machine artifact. One major improvement from previous recommended protocols is that hand filters do not need to be applied at any point in the process now. All filtering criteria are learned from the data itself.

+

Common, base command line

+

This is the first part of the VariantRecalibrator command line, to which you need to add either the SNP-specific recommendations or the indel-specific recommendations given further below.

+
+java -Xmx4g -jar GenomeAnalysisTK.jar \
+   -T VariantRecalibrator \
+   -R path/to/reference/human_g1k_v37.fasta \
+   -input raw.input.vcf \
+   -recalFile path/to/output.recal \
+   -tranchesFile path/to/output.tranches \
+   -nt 4 \
+   [SPECIFY TRUTH AND TRAINING SETS] \
+   [SPECIFY WHICH ANNOTATIONS TO USE IN MODELING] \
+   [SPECIFY WHICH CLASS OF VARIATION TO MODEL] \
+
+

SNP specific recommendations

+

For SNPs we use both HapMap v3.3 and the Omni chip array from the 1000 Genomes Project as training data. In addition we take the highest confidence SNPs from the project's callset. These datasets are available in the GATK resource bundle.

+
+   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
+   -resource:omni,known=false,training=true,truth=true,prior=12.0 1000G_omni2.5.b37.sites.vcf \
+   -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.high_confidence.vcf \
+   -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.b37.vcf \
+   -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -an DP -an InbreedingCoeff \
+   -mode SNP \
+
+

Please note that these recommendations are formulated for whole-genome datasets. For exomes, we do not recommend using DP for variant recalibration (see below for details of why).

+

Note also that, for the above to work, the input vcf needs to be annotated with the corresponding values (QD, FS, DP, etc.). If any of these values are somehow missing, then VariantAnnotator needs to be run first so that VariantRecalibration can run properly.
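
As a minimal sketch (hypothetical file names), such a pre-annotation step might look like the following; the -I argument supplies the original BAM so that read-based annotations can be computed:

java -Xmx4g -jar GenomeAnalysisTK.jar -T VariantAnnotator \
    -R human_g1k_v37.fasta \
    -I sample1.bam \
    -V raw.input.vcf \
    -A QualByDepth -A FisherStrand -A StrandOddsRatio -A RMSMappingQuality \
    -A MappingQualityRankSumTest -A ReadPosRankSumTest -A Coverage \
    -o raw.input.annotated.vcf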

+

Also, using the provided sites-only truth data files is important here as parsing the genotypes for VCF files with many samples increases the runtime of the tool significantly.

+

You may notice that these recommendations no longer include the --numBadVariants argument. That is because we have removed this argument from the tool, as the VariantRecalibrator now determines the number of variants to use for modeling "bad" variants internally based on the data.

+

Indel specific recommendations

+

When modeling indels with the VQSR we use a training dataset that was created at the Broad by strictly curating the (Mills, Devine, Genome Research, 2011) dataset as well as adding in very high confidence indels from the 1000 Genomes Project. This dataset is available in the GATK resource bundle.

+
+   --maxGaussians 4 \
+   -resource:mills,known=false,training=true,truth=true,prior=12.0 Mills_and_1000G_gold_standard.indels.b37.sites.vcf \
+   -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.b37.vcf \
+   -an QD -an DP -an FS -an SOR -an ReadPosRankSum -an MQRankSum -an InbreedingCoeff \
+   -mode INDEL \
+
+

Note that indels use a different set of annotations than SNPs. Most annotations related to mapping quality have been removed since there is a conflation with the length of an indel in a read and the degradation in mapping quality that is assigned to the read by the aligner. This covariation is not necessarily indicative of being an error in the same way that it is for SNPs.

+

You may notice that these recommendations no longer include the --numBadVariants argument. That is because we have removed this argument from the tool, as the VariantRecalibrator now determines the number of variants to use for modeling "bad" variants internally based on the data.

+
+

Argument recommendations for ApplyRecalibration

+

The power of the VQSR is that it assigns a calibrated probability to every putative mutation in the callset. The user is then able to decide at what point on the theoretical ROC curve their project wants to live. Some projects, for example, are interested in finding every possible mutation and can tolerate a higher false positive rate. On the other hand, some projects want to generate a ranked list of mutations that they are very certain are real and well supported by the underlying data. The VQSR provides the necessary statistical machinery to effectively apply this sensitivity/specificity tradeoff.

+

Common, base command line

+

This is the first part of the ApplyRecalibration command line, to which you need to add either the SNP-specific recommendations or the indel-specific recommendations given further below.

+
 
+ java -Xmx3g -jar GenomeAnalysisTK.jar \
+   -T ApplyRecalibration \
+   -R reference/human_g1k_v37.fasta \
+   -input raw.input.vcf \
+   -tranchesFile path/to/input.tranches \
+   -recalFile path/to/input.recal \
+   -o path/to/output.recalibrated.filtered.vcf \
+   [SPECIFY THE DESIRED LEVEL OF SENSITIVITY TO TRUTH SITES] \
+   [SPECIFY WHICH CLASS OF VARIATION WAS MODELED] \
+ 
+

SNP specific recommendations

+

For SNPs we used HapMap 3.3 and the Omni 2.5M chip as our truth set. We typically seek to achieve 99.5% sensitivity to the accessible truth sites, but this is by no means universally applicable: you will need to experiment to find out what tranche cutoff is right for your data. Generally speaking, projects involving a higher degree of diversity in terms of world populations can expect to achieve a higher truth sensitivity than projects with a smaller scope.

+
+   --ts_filter_level 99.5 \
+   -mode SNP \
+
+

Indel specific recommendations

+

For indels we use the Mills / 1000 Genomes indel truth set described above. We typically seek to achieve 99.0% sensitivity to the accessible truth sites, but this is by no means universally applicable: you will need to experiment to find out what tranche cutoff is right for your data. Generally speaking, projects involving a higher degree of diversity in terms of world populations can expect to achieve a higher truth sensitivity than projects with a smaller scope.

+
+   --ts_filter_level 99.0 \
+   -mode INDEL \
+
\ No newline at end of file diff --git a/doc_archive/faqs/Why_are_some_of_the_annotation_values_different_with_VariantAnnotator_compared_to_UG_or_HC?.md b/doc_archive/faqs/Why_are_some_of_the_annotation_values_different_with_VariantAnnotator_compared_to_UG_or_HC?.md new file mode 100644 index 000000000..3f25c00ba --- /dev/null +++ b/doc_archive/faqs/Why_are_some_of_the_annotation_values_different_with_VariantAnnotator_compared_to_UG_or_HC?.md @@ -0,0 +1,13 @@ +## Why are some of the annotation values different with VariantAnnotator compared to UG or HC? + +http://gatkforums.broadinstitute.org/gatk/discussion/1550/why-are-some-of-the-annotation-values-different-with-variantannotator-compared-to-ug-or-hc + +

As featured in this forum question.

+

Two main things account for these kinds of differences, both linked to default behaviors of the tools:

+ +

In both cases, you can end up looking at different sets or numbers of reads, which causes some of the annotation values to be different. It's usually not a cause for alarm. Remember that many of these annotations should be interpreted relatively, not absolutely.

\ No newline at end of file diff --git a/doc_archive/methods/Base_Quality_Score_Recalibration_(BQSR).md b/doc_archive/methods/Base_Quality_Score_Recalibration_(BQSR).md new file mode 100644 index 000000000..8cf8ec84e --- /dev/null +++ b/doc_archive/methods/Base_Quality_Score_Recalibration_(BQSR).md @@ -0,0 +1,230 @@ +## Base Quality Score Recalibration (BQSR) + +http://gatkforums.broadinstitute.org/gatk/discussion/44/base-quality-score-recalibration-bqsr + +

BQSR stands for Base Quality Score Recalibration. In a nutshell, it is a data pre-processing step that detects systematic errors made by the sequencer when it estimates the quality score of each base call. This document starts with a high-level overview of the purpose of this method; deeper technical details are provided further down.

+

Note that this base recalibration process (BQSR) should not be confused with variant recalibration (VQSR), which is a sophisticated filtering technique applied on the variant callset produced in a later step. The developers who named these methods wish to apologize sincerely to any Spanish-speaking users who might get awfully confused at this point.

+
+

Wait, what are base quality scores again?

+

These scores are per-base estimates of error emitted by the sequencing machines; they express how confident the machine was that it called the correct base each time. For example, let's say the machine reads an A nucleotide, and assigns a quality score of Q20 -- in Phred-scale, that means it's 99% sure it identified the base correctly. This may seem high, but it does mean that we can expect it to be wrong in one case out of 100; so if we have several billion basecalls (we get ~90 billion in a 30x genome), at that rate the machine would make the wrong call in 900 million bases. In practice each basecall gets its own quality score, determined through some dark magic jealously guarded by the manufacturer of the sequencer.

+

Variant calling algorithms rely heavily on the quality score assigned to the individual base calls in each sequence read. This is because the quality score tells us how much we can trust that particular observation to inform us about the biological truth of the site where that base aligns. If we have a basecall that has a low quality score, that means we're not sure we actually read that A correctly, and it could actually be something else. So we won't trust it as much as other base calls that have higher qualities. In other words we use that score to weigh the evidence that we have for or against a variant allele existing at a particular site.

+

Okay, so what is base recalibration?

+

Unfortunately the scores produced by the machines are subject to various sources of systematic (non-random) technical error, leading to over- or under-estimated base quality scores in the data. Some of these errors are due to the physics or the chemistry of how the sequencing reaction works, and some are probably due to manufacturing flaws in the equipment.

+

Base quality score recalibration (BQSR) is a process in which we apply machine learning to model these errors empirically and adjust the quality scores accordingly. For example we can identify that, for a given run, whenever we called two A nucleotides in a row, the next base we called had a 1% higher rate of error. So any base call that comes after AA in a read should have its quality score reduced by 1%. We do that over several different covariates (mainly sequence context and position in read, or cycle) in a way that is additive. So the same base may have its quality score increased for one reason and decreased for another.

+

This allows us to get more accurate base qualities overall, which in turn improves the accuracy of our variant calls. To be clear, we can't correct the base calls themselves, i.e. we can't determine whether that low-quality A should actually have been a T -- but we can at least tell the variant caller more accurately how far it can trust that A. Note that in some cases we may find that some bases should have a higher quality score, which allows us to rescue observations that otherwise may have been given less consideration than they deserve. Anecdotally my impression is that sequencers are more often over-confident than under-confident, but we do occasionally see runs from sequencers that seemed to suffer from low self-esteem.

+

Fantastic! How does it work?

+

The base recalibration process involves two key steps: first the program builds a model of covariation based on the data and a set of known variants, then it adjusts the base quality scores in the data based on the model. The known variants are used to mask out bases at sites of real (expected) variation, to avoid counting real variants as errors. Outside of the masked sites, every mismatch is counted as an error. The rest is mostly accounting.

+

There is an optional but highly recommended step that involves building a second model and generating before/after plots to visualize the effects of the recalibration process. This is useful for quality control purposes.

+
+

More detailed information

+

Detailed information about command line options for BaseRecalibrator can be found here.

+

The tools in this package recalibrate base quality scores of sequencing-by-synthesis reads in an aligned BAM file. After recalibration, the quality scores in the QUAL field in each read in the output BAM are more accurate in that the reported quality score is closer to its actual probability of mismatching the reference genome. Moreover, the recalibration tool attempts to correct for variation in quality with machine cycle and sequence context, and by doing so provides not only more accurate quality scores but also more widely dispersed ones. The system works on BAM files coming from many sequencing platforms: Illumina, SOLiD, 454, Complete Genomics, Pacific Biosciences, etc.

+

This process is accomplished by analyzing the covariation among several features of a base. For example: +

+ +

These covariates are then applied through a piecewise tabular correction to recalibrate the quality scores of all reads in a BAM file. +

For example, before recalibration a file could contain only reported Q25 bases, which seems good. However, it may be that these bases actually mismatch the reference at a 1 in 100 rate, so they are actually Q20. These higher-than-empirical quality scores provide false confidence in the base calls. Moreover, as is common with sequencing-by-synthesis machines, base mismatches with the reference occur at the end of the reads more frequently than at the beginning. Also, mismatches are strongly associated with sequencing context, in that the dinucleotide AC is often much lower quality than TG. The recalibration tool will not only correct the average Q inaccuracy (shifting from Q25 to Q20) but also identify subsets of high-quality bases by separating the low-quality end-of-read AC bases from the high-quality TG bases at the start of the read. See below for examples of pre- and post-corrected values. +

The system was designed so that (sophisticated) users can easily add new covariates to the calculations. If you wish to add your own covariate, simply look at QualityScoreCovariate.java for an idea of how to implement the required interface. Each covariate is a Java class which implements the org.broadinstitute.sting.gatk.walkers.recalibration.Covariate interface. Specifically, the class needs to have a getValue method defined which looks at the read and associated sequence context and pulls out the desired information such as machine cycle. +

+

Running the tools

+

BaseRecalibrator

+

Detailed information about command line options for BaseRecalibrator can be found here. +

This GATK processing step walks over all of the reads in my_reads.bam and tabulates data about the following features of the bases: +

+ +

For each bin, we count the number of bases within the bin and how often such bases mismatch the reference base, excluding loci known to vary in the population, according to dbSNP. After running over all reads, BaseRecalibrator produces a file called my_reads.recal_data.grp, which contains the data needed to recalibrate reads. The format of this GATK report is described below. +
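
As a minimal sketch, that first pass might look like the following (the file names are hypothetical, and dbsnp.vcf stands in for whatever known-sites file you are using):

java -Xmx4g -jar GenomeAnalysisTK.jar \
   -T BaseRecalibrator \
   -R reference.fasta \
   -I my_reads.bam \
   -knownSites dbsnp.vcf \
   -o my_reads.recal_data.grp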

+

Creating a recalibrated BAM

+

To create a recalibrated BAM you can use GATK's PrintReads with the engine on-the-fly recalibration capability. Here is a typical command line to do so: +

+
 
+java -jar GenomeAnalysisTK.jar \
+   -T PrintReads \
+   -R reference.fasta \
+   -I input.bam \
+   -BQSR recalibration_report.grp \
+   -o output.bam
+
+

After computing covariates in the initial BAM file, we walk through the BAM file again and rewrite the quality scores (in the QUAL field) using the data in the recalibration_report.grp file, writing the result to a new BAM file. +

+

This step uses the recalibration table data in recalibration_report.grp produced by BaseRecalibrator to recalibrate the quality scores in input.bam, writing out a new BAM file output.bam with recalibrated QUAL field values. +

+

Effectively the new quality score is:

+ +

Following recalibration, the read quality scores are much closer to their empirical scores than before. This means they can be used in a statistically robust manner for downstream processing, such as SNP calling. In addition, by accounting for quality changes by cycle and sequence context, we can identify truly high quality bases in the reads, often finding a subset of bases that are Q30 even when no bases were originally labeled as such. +

+

Miscellaneous information

+ +

Example pre and post recalibration results

+ +

+ + + +

+

The output of the BaseRecalibrator

+ +

Note that the BaseRecalibrator no longer produces plots; this is now done by the AnalyzeCovariates tool.

+

The Recalibration Report

+

The recalibration report is a [GATKReport](http://gatk.vanillaforums.com/discussion/1244/what-is-a-gatkreport) and not only contains the main result of the analysis, but it is also used as an input to all subsequent analyses on the data. The recalibration report contains the following 5 tables: +

+ +

Arguments Table

+

This is the table that contains all the arguments used to run BQSRv2 for this dataset. This is important for the on-the-fly recalibration step to use the same parameters used in the recalibration step (context sizes, covariates, ...). +

+

Example Arguments table:

+
 
+#:GATKTable:true:1:17::;
+#:GATKTable:Arguments:Recalibration argument collection values used in this run
+Argument                    Value
+covariate                   null
+default_platform            null
+deletions_context_size      6
+force_platform              null
+insertions_context_size     6
+...
+
+

Quantization Table

+

The GATK offers native support to quantize base qualities. The GATK quantization procedure uses a statistical approach to determine the best binning system that minimizes the error introduced by amalgamating the different qualities present in the specific dataset. When running BQSRv2, a table with the base counts for each base quality is generated, along with a 'default' quantization table. This table is a required parameter for any other tool in the GATK if you want to quantize your quality scores. +

+

The default behavior (currently) is to use no quantization when performing on-the-fly recalibration. You can override this by using the engine argument -qq: with -qq 0 you don't quantize qualities, and with -qq N you recalculate the quantization bins using N bins on the fly. Note that quantization is still completely experimental and we do not recommend using it unless you are a super advanced user. +
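
For example, a minimal sketch of an on-the-fly recalibration run that also re-bins qualities into 8 levels (hypothetical file names; use -qq 0 instead to disable quantization entirely):

java -jar GenomeAnalysisTK.jar -T PrintReads -R reference.fasta -I input.bam -BQSR recalibration_report.grp -qq 8 -o output.quantized.bam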

+

Example Quantization table: +

+
 
+#:GATKTable:true:2:94:::;
+#:GATKTable:Quantized:Quality quantization map
+QualityScore  Count        QuantizedScore
+0                     252               0
+1                   15972               1
+2                  553525               2
+3                 2190142               9
+4                 5369681               9
+9                83645762               9
+...
+
+

ReadGroup Table

+

This table contains the empirical quality scores for each read group, for mismatches, insertions and deletions. This is no different from the table used in the old table recalibration walker. +

+
 
+#:GATKTable:false:6:18:%s:%s:%.4f:%.4f:%d:%d:;
+#:GATKTable:RecalTable0:
+ReadGroup  EventType  EmpiricalQuality  EstimatedQReported  Observations  Errors
+SRR032768  D                   40.7476             45.0000    2642683174    222475
+SRR032766  D                   40.9072             45.0000    2630282426    213441
+SRR032764  D                   40.5931             45.0000    2919572148    254687
+SRR032769  D                   40.7448             45.0000    2850110574    240094
+SRR032767  D                   40.6820             45.0000    2820040026    241020
+SRR032765  D                   40.9034             45.0000    2441035052    198258
+SRR032766  M                   23.2573             23.7733    2630282426  12424434
+SRR032768  M                   23.0281             23.5366    2642683174  13159514
+SRR032769  M                   23.2608             23.6920    2850110574  13451898
+SRR032764  M                   23.2302             23.6039    2919572148  13877177
+SRR032765  M                   23.0271             23.5527    2441035052  12158144
+SRR032767  M                   23.1195             23.5852    2820040026  13750197
+SRR032766  I                   41.7198             45.0000    2630282426    177017
+SRR032768  I                   41.5682             45.0000    2642683174    184172
+SRR032769  I                   41.5828             45.0000    2850110574    197959
+SRR032764  I                   41.2958             45.0000    2919572148    216637
+SRR032765  I                   41.5546             45.0000    2441035052    170651
+SRR032767  I                   41.5192             45.0000    2820040026    198762
+
+

Quality Score Table

+

This table contains the empirical quality scores for each read group and original quality score, for mismatches, insertions and deletions. This is no different from the table used in the old table recalibration walker. +

+
 
+#:GATKTable:false:6:274:%s:%s:%s:%.4f:%d:%d:;
+#:GATKTable:RecalTable1:
+ReadGroup  QualityScore  EventType  EmpiricalQuality  Observations  Errors
+SRR032767            49  M                   33.7794          9549        3
+SRR032769            49  M                   36.9975          5008        0
+SRR032764            49  M                   39.2490          8411        0
+SRR032766            18  M                   17.7397      16330200   274803
+SRR032768            18  M                   17.7922      17707920   294405
+SRR032764            45  I                   41.2958    2919572148   216637
+SRR032765             6  M                    6.0600       3401801   842765
+SRR032769            45  I                   41.5828    2850110574   197959
+SRR032764             6  M                    6.0751       4220451  1041946
+SRR032767            45  I                   41.5192    2820040026   198762
+SRR032769             6  M                    6.3481       5045533  1169748
+SRR032768            16  M                   15.7681      12427549   329283
+SRR032766            16  M                   15.8173      11799056   309110
+SRR032764            16  M                   15.9033      13017244   334343
+SRR032769            16  M                   15.8042      13817386   363078
+...
+
+

Covariates Table

+

This table has the empirical qualities for each covariate used in the dataset. The default covariates are cycle and context. In the current implementation, context is of a fixed size (default 6). Each context and each cycle will have an entry on this table stratified by read group and original quality score. +

+
 
+#:GATKTable:false:8:1003738:%s:%s:%s:%s:%s:%.4f:%d:%d:;
+#:GATKTable:RecalTable2:
+ReadGroup  QualityScore  CovariateValue  CovariateName  EventType  EmpiricalQuality  Observations  Errors
+SRR032767            16  TACGGA          Context        M                   14.2139           817      30
+SRR032766            16  AACGGA          Context        M                   14.9938          1420      44
+SRR032765            16  TACGGA          Context        M                   15.5145           711      19
+SRR032768            16  AACGGA          Context        M                   15.0133          1585      49
+SRR032764            16  TACGGA          Context        M                   14.5393           710      24
+SRR032766            16  GACGGA          Context        M                   17.9746          1379      21
+SRR032768            45  CACCTC          Context        I                   40.7907        575849      47
+SRR032764            45  TACCTC          Context        I                   43.8286        507088      20
+SRR032769            45  TACGGC          Context        D                   38.7536         37525       4
+SRR032768            45  GACCTC          Context        I                   46.0724        445275      10
+SRR032766            45  CACCTC          Context        I                   41.0696        575664      44
+SRR032769            45  TACCTC          Context        I                   43.4821        490491      21
+SRR032766            45  CACGGC          Context        D                   45.1471         65424       1
+SRR032768            45  GACGGC          Context        D                   45.3980         34657       0
+SRR032767            45  TACGGC          Context        D                   42.7663         37814       1
+SRR032767            16  AACGGA          Context        M                   15.9371          1647      41
+SRR032764            16  GACGGA          Context        M                   18.2642          1273      18
+SRR032769            16  CACGGA          Context        M                   13.0801          1442      70
+SRR032765            16  GACGGA          Context        M                   15.9934          1271      31
+...
+
+

Troubleshooting

+

The memory requirements of the recalibrator will vary based on the type of JVM running the application and the number of read groups in the input bam file.

+

If the application reports 'java.lang.OutOfMemoryError: Java heap space', increase the max heap size provided to the JVM by adding ' -Xmx????m' to the jvm_args variable in RecalQual.py, where '????' is the maximum available memory on the processing computer.

+

I've tried recalibrating my data using a downloaded file, such as NA12878 on 454, but applying the table to any of the chromosome BAM files always fails due to hitting my memory limit. I've tried giving it as much as 15GB but that still isn't enough.

+

All of our big merged files for 454 are running with -Xmx16000m arguments to the JVM -- it's enough to process all of the files. 32GB might make the 454 runs a lot faster though.

+

I have a recalibration file calculated over the entire genome (such as for the 1000 genomes trio) but I split my file into pieces (such as by chromosome). Can the recalibration tables safely be applied to the per chromosome BAM files?

+

Yes they can. The original tables needed to be calculated over the whole genome but they can be applied to each piece of the data set independently.

+

I'm working on a genome that doesn't really have a good SNP database yet. I'm wondering if it still makes sense to run base quality score recalibration without known SNPs.

+

The base quality score recalibrator treats every reference mismatch as indicative of machine error. True polymorphisms are legitimate mismatches to the reference and shouldn't be counted against the quality of a base. We use a database of known polymorphisms to skip over most polymorphic sites. Unfortunately without this information the data becomes almost completely unusable since the quality of the bases will be inferred to be much much lower than it actually is as a result of the reference-mismatching SNP sites.

+

However, all is not lost if you are willing to experiment a bit. You can bootstrap a database of known SNPs. Here's how it works:

+ +
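
As a minimal sketch of one round of that bootstrap (hypothetical file names; bootstrap_calls.highconf.vcf stands for whatever high-confidence subset you extract from the initial calls, e.g. by strict filtering):

# 1. call variants on the unrecalibrated data
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R my_reference.fasta -I unrecalibrated.bam -o bootstrap_calls.vcf

# 2. build a recalibration table, treating your most confident calls as "known" sites
java -jar GenomeAnalysisTK.jar -T BaseRecalibrator -R my_reference.fasta -I unrecalibrated.bam \
    -knownSites bootstrap_calls.highconf.vcf -o recal_data.grp

# 3. write a recalibrated BAM and repeat from step 1 until the results converge
java -jar GenomeAnalysisTK.jar -T PrintReads -R my_reference.fasta -I unrecalibrated.bam \
    -BQSR recal_data.grp -o recalibrated.bam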

Downsampling to reduce run time

+

For users concerned about run time please note this small analysis below showing the approximate number of reads per read group that are required to achieve a given level of recalibration performance. The analysis was performed with 51 base pair Illumina reads on pilot data from the 1000 Genomes Project. Downsampling can be achieved by specifying a genome interval using the -L option. For users concerned only with recalibration accuracy please disregard this plot and continue to use all available data when generating the recalibration table. +

+

+

\ No newline at end of file diff --git a/doc_archive/methods/Best_Practices_for_Variant_Discovery_in_DNAseq.md b/doc_archive/methods/Best_Practices_for_Variant_Discovery_in_DNAseq.md new file mode 100644 index 000000000..1d8cafd98 --- /dev/null +++ b/doc_archive/methods/Best_Practices_for_Variant_Discovery_in_DNAseq.md @@ -0,0 +1,38 @@ +## Best Practices for Variant Discovery in DNAseq + +http://gatkforums.broadinstitute.org/gatk/discussion/3238/best-practices-for-variant-discovery-in-dnaseq + +This article is part of the Best Practices documentation. See http://www.broadinstitute.org/gatk/guide/best-practices for the full documentation set. +

This is our recommended workflow for calling variants in DNAseq data from cohorts of samples, in which steps from data processing up to variant calling are performed per-sample, and subsequent steps are performed jointly on all the individuals in the cohort.

+

+

The workflow is divided into three main sections that are meant to be performed sequentially:

+ +
+

Pre-Processing

+

The data generated by the sequencers are put through some pre-processing steps to make them suitable for variant calling analysis. The steps involved are: Mapping and Marking Duplicates; Local Realignment Around Indels; and Base Quality Score Recalibration (BQSR); performed in that order.

+

Mapping and Marking Duplicates

+

The sequence reads are first mapped to the reference using BWA mem to produce a file in SAM/BAM format sorted by coordinate. The next step is to mark duplicates. The rationale here is that during the sequencing process, the same DNA molecules can be sequenced several times. The resulting duplicate reads are not informative and should not be counted as additional evidence for or against a putative variant. The duplicate marking process identifies these reads as such so that the GATK tools know they should ignore them.
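For illustration only, a minimal sketch of this step using BWA and Picard (read group values, thread count and file names are placeholders to adapt to your data):

# map the reads and attach read group information (-M marks shorter split hits as secondary, for Picard compatibility)
bwa mem -M -t 8 -R '@RG\tID:rg1\tSM:sample1\tPL:ILLUMINA\tLB:lib1\tPU:unit1' ref.fasta reads_1.fq reads_2.fq > aligned.sam

# sort by coordinate, then mark duplicates
java -jar picard.jar SortSam I=aligned.sam O=sorted.bam SORT_ORDER=coordinate
java -jar picard.jar MarkDuplicates I=sorted.bam O=dedup.bam M=dedup_metrics.txt CREATE_INDEX=true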

+

Realignment Around Indels

+

Next, local realignment is performed around indels, because the algorithms that are used in the initial mapping step tend to produce various types of artifacts. For example, reads that align on the edges of indels often get mapped with mismatching bases that might look like evidence for SNPs, but are actually mapping artifacts. The realignment process identifies the most consistent placement of the reads relative to the indel in order to clean up these artifacts. It occurs in two steps: first the program identifies intervals that need to be realigned, then in the second step it determines the optimal consensus sequence and performs the actual realignment of reads.
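A minimal sketch of the two realignment steps with the GATK 3 tools (the known indels resource and file names are placeholders):

# step 1: identify the intervals that need to be realigned
java -jar GenomeAnalysisTK.jar -T RealignerTargetCreator -R ref.fasta -I dedup.bam \
    -known known_indels.vcf -o realign_targets.intervals

# step 2: perform the actual realignment over those intervals
java -jar GenomeAnalysisTK.jar -T IndelRealigner -R ref.fasta -I dedup.bam \
    -targetIntervals realign_targets.intervals -known known_indels.vcf -o realigned.bam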

+

Base Quality Score Recalibration

+

Finally, base quality scores are recalibrated, because the variant calling algorithms rely heavily on the quality scores assigned to the individual base calls in each sequence read. These scores are per-base estimates of error emitted by the sequencing machines. Unfortunately the scores produced by the machines are subject to various sources of systematic error, leading to over- or under-estimated base quality scores in the data. Base quality score recalibration is a process in which we apply machine learning to model these errors empirically and adjust the quality scores accordingly. This yields more accurate base qualities, which in turn improves the accuracy of the variant calls. The base recalibration process involves two key steps: first the program builds a model of covariation based on the data and a set of known variants, then it adjusts the base quality scores in the data based on the model.
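A minimal sketch of the two BQSR steps with GATK 3 (the known sites resources are placeholders):

# step 1: build the covariation model from the data and the known variant sites
java -jar GenomeAnalysisTK.jar -T BaseRecalibrator -R ref.fasta -I realigned.bam \
    -knownSites dbsnp.vcf -knownSites known_indels.vcf -o recal_data.table

# step 2: apply the model to adjust the base quality scores
java -jar GenomeAnalysisTK.jar -T PrintReads -R ref.fasta -I realigned.bam \
    -BQSR recal_data.table -o recal.bam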

+
+

Variant Discovery

+

Once the data has been pre-processed as described above, it is put through the variant discovery process, i.e. the identification of sites where the data displays variation relative to the reference genome, and calculation of genotypes for each sample at that site. Because some of the variation observed is caused by mapping and sequencing artifacts, the greatest challenge here is to balance the need for sensitivity (to minimize false negatives, i.e. failing to identify real variants) vs. specificity (to minimize false positives, i.e. failing to reject artifacts). It is very difficult to reconcile these objectives in a single step, so instead the variant discovery process is decomposed into separate steps: variant calling (performed per-sample), joint genotyping (performed per-cohort) and variant filtering (also performed per-cohort). The first two steps are designed to maximize sensitivity, while the filtering step aims to deliver a level of specificity that can be customized for each project.

+

Per-Sample Variant Calling

+

We perform variant calling by running the HaplotypeCaller on each sample BAM file (if a sample's data is spread over more than one BAM, then pass them all in together) to create single-sample gVCFs. If there are more than a few hundred samples, we combine the gVCFs in batches of ~200 gVCFs using a specialized tool, CombineGVCFs. This will make the next step more tractable and reflects that the processing bottleneck lies with the number of input files and not the number of samples in those files.

+

Joint Genotyping

+

All available samples are then jointly genotyped by taking the gVCFs produced earlier and running GenotypeGVCFs on all of them together to create a set of raw SNP and indel calls. This cohort-wide analysis empowers sensitive detection of variants even at difficult sites.

+

Variant Quality Score Recalibration

+

Variant recalibration involves using a machine learning method to assign a well-calibrated probability to each variant call in a raw call set. We can then use this variant quality score in the second step to filter the raw call set, thus producing a subset of calls with our desired level of quality, fine-tuned to balance specificity and sensitivity.
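As a hedged sketch, the two VQSR steps for SNPs with GATK 3 can look like the following; the resource files, annotations and tranche sensitivity level shown here are illustrative, so check the VariantRecalibrator documentation for the current recommendations. Indels are then recalibrated in a separate pass with -mode INDEL and their own resources.

# step 1: build the recalibration model
java -jar GenomeAnalysisTK.jar -T VariantRecalibrator -R ref.fasta -input raw_variants.vcf \
    -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap.vcf \
    -resource:omni,known=false,training=true,truth=true,prior=12.0 omni.vcf \
    -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_snps.vcf \
    -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.vcf \
    -an QD -an FS -an MQ -an MQRankSum -an ReadPosRankSum -mode SNP \
    -recalFile recalibrate_SNP.recal -tranchesFile recalibrate_SNP.tranches

# step 2: apply the chosen sensitivity threshold to filter the callset
java -jar GenomeAnalysisTK.jar -T ApplyRecalibration -R ref.fasta -input raw_variants.vcf \
    -mode SNP --ts_filter_level 99.5 -recalFile recalibrate_SNP.recal \
    -tranchesFile recalibrate_SNP.tranches -o recalibrated_snps.vcf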

+
+

Refinement and evaluation

+

In this last section, we perform some refinement steps on the genotype calls (GQ estimation and transmission phasing), add functional annotations if desired, and do some quality evaluation by comparing the callset to known resources. None of these steps are absolutely required, and the workflow may need to be adapted quite a bit to each project's requirements.

+
+Important note on GATK versions + +The [Best Practices](http://www.broadinstitute.org/gatk/guide/best-practices) have been updated for GATK version 3. If you are running an older version, you should seriously consider upgrading. For more details about what has changed in each version, please see the [Version History](http://www.broadinstitute.org/gatk/guide/version-history) section. If you cannot upgrade your version of GATK for any reason, please look up the corresponding version of the GuideBook PDF (also in the [Version History](http://www.broadinstitute.org/gatk/guide/version-history) section) to ensure that you are using the appropriate recommendations for your version. \ No newline at end of file diff --git a/doc_archive/methods/Best_Practices_for_Variant_Discovery_in_RNAseq.md b/doc_archive/methods/Best_Practices_for_Variant_Discovery_in_RNAseq.md new file mode 100644 index 000000000..ef6b4d239 --- /dev/null +++ b/doc_archive/methods/Best_Practices_for_Variant_Discovery_in_RNAseq.md @@ -0,0 +1,41 @@ +## Best Practices for Variant Discovery in RNAseq + +http://gatkforums.broadinstitute.org/gatk/discussion/4067/best-practices-for-variant-discovery-in-rnaseq + +This article is part of the Best Practices documentation. See http://www.broadinstitute.org/gatk/guide/best-practices for the full documentation set. +

This is our recommended workflow for calling variants in RNAseq data from single samples, in which all steps are performed per-sample. In the future we will provide cohort analysis recommendations, but these are not yet available.

+

+

The workflow is divided into three main sections that are meant to be performed sequentially:

+ +

Compared to the DNAseq Best Practices, the key adaptations for calling variants in RNAseq focus on handling splice junctions correctly, which involves specific mapping and pre-processing procedures, as well as some new functionality in the HaplotypeCaller; these changes are highlighted in the figure below.

+

+
+

Pre-Processing

+

The data generated by the sequencers are put through some pre-processing steps to make them suitable for variant calling analysis. The steps involved are: Mapping and Marking Duplicates; Split'N'Trim; Local Realignment Around Indels (optional); and Base Quality Score Recalibration (BQSR); performed in that order.

+

Mapping and Marking Duplicates

+

The sequence reads are first mapped to the reference using STAR aligner (2-pass protocol) to produce a file in SAM/BAM format sorted by coordinate. The next step is to mark duplicates. The rationale here is that during the sequencing process, the same DNA molecules can be sequenced several times. The resulting duplicate reads are not informative and should not be counted as additional evidence for or against a putative variant. The duplicate marking process identifies these reads as such so that the GATK tools know they should ignore them.

+

Split'N'Trim

+

Then, an RNAseq-specific step is applied: reads with N operators in the CIGAR strings (which denote the presence of a splice junction) are split into component reads and trimmed to remove any overhangs into splice junctions, which reduces the occurrence of artifacts. At this step, we also reassign mapping qualities from 255 (assigned by STAR) to 60 which is more meaningful for GATK tools.

+

Realignment Around Indels

+

Next, local realignment is performed around indels, because the algorithms that are used in the initial mapping step tend to produce various types of artifacts. For example, reads that align on the edges of indels often get mapped with mismatching bases that might look like evidence for SNPs, but are actually mapping artifacts. The realignment process identifies the most consistent placement of the reads relative to the indel in order to clean up these artifacts. It occurs in two steps: first the program identifies intervals that need to be realigned, then in the second step it determines the optimal consensus sequence and performs the actual realignment of reads. This step is considered optional for RNAseq.

+

Base Quality Score Recalibration

+

Finally, base quality scores are recalibrated, because the variant calling algorithms rely heavily on the quality scores assigned to the individual base calls in each sequence read. These scores are per-base estimates of error emitted by the sequencing machines. Unfortunately the scores produced by the machines are subject to various sources of systematic error, leading to over- or under-estimated base quality scores in the data. Base quality score recalibration is a process in which we apply machine learning to model these errors empirically and adjust the quality scores accordingly. This yields more accurate base qualities, which in turn improves the accuracy of the variant calls. The base recalibration process involves two key steps: first the program builds a model of covariation based on the data and a set of known variants, then it adjusts the base quality scores in the data based on the model.

+
+

Variant Discovery

+

Once the data has been pre-processed as described above, it is put through the variant discovery process, i.e. the identification of sites where the data displays variation relative to the reference genome, and calculation of genotypes for each sample at that site. Because some of the variation observed is caused by mapping and sequencing artifacts, the greatest challenge here is to balance the need for sensitivity (to minimize false negatives, i.e. failing to identify real variants) vs. specificity (to minimize false positives, i.e. failing to reject artifacts). It is very difficult to reconcile these objectives in a single step, so instead the variant discovery process is decomposed into separate steps: variant calling (performed per-sample) and variant filtering (also performed per-sample). The first step is designed to maximize sensitivity, while the filtering step aims to deliver a level of specificity that can be customized for each project.

+

Our current recommendation for RNAseq is to run all these steps per-sample. At the moment, we do not recommend applying the GVCF-based workflow to RNAseq data because although there is no obvious obstacle to doing so, we have not validated that configuration. Therefore, we cannot guarantee the quality of results that this would produce.

+

Per-Sample Variant Calling

+

We perform variant calling by running the HaplotypeCaller on each sample BAM file (if a sample's data is spread over more than one BAM, then pass them all in together) to create single-sample VCFs containing raw SNP and indel calls.

+

Per-Sample Variant Filtering

+

For RNAseq, it is not appropriate to apply variant recalibration in its present form. Instead, we provide hard-filtering recommendations to filter variants based on specific annotation value thresholds. This produces a VCF of calls annotated with filtering information that can then be used in downstream analyses.

+
+

Refinement and evaluation

+

In this last section, we perform some refinement steps on the genotype calls (GQ estimation and transmission phasing), add functional annotations if desired, and do some quality evaluation by comparing the callset to known resources. None of these steps are absolutely required, and the workflow may need to be adapted quite a bit to each project's requirements.

+
+Important note on GATK versions + +The [Best Practices](http://www.broadinstitute.org/gatk/guide/best-practices) have been updated for GATK version 3. If you are running an older version, you should seriously consider upgrading. For more details about what has changed in each version, please see the [Version History](http://www.broadinstitute.org/gatk/guide/version-history) section. If you cannot upgrade your version of GATK for any reason, please look up the corresponding version of the GuideBook PDF (also in the [Version History](http://www.broadinstitute.org/gatk/guide/version-history) section) to ensure that you are using the appropriate recommendations for your version. \ No newline at end of file diff --git a/doc_archive/methods/Calling_variants_in_RNAseq.md b/doc_archive/methods/Calling_variants_in_RNAseq.md new file mode 100644 index 000000000..4dad469d1 --- /dev/null +++ b/doc_archive/methods/Calling_variants_in_RNAseq.md @@ -0,0 +1,80 @@ +## Calling variants in RNAseq + +http://gatkforums.broadinstitute.org/gatk/discussion/3891/calling-variants-in-rnaseq + +

Overview

+

This document describes the details of the GATK Best Practices workflow for SNP and indel calling on RNAseq data.

+

Please note that any command lines are only given as examples of how the tools can be run. You should always make sure you understand what is being done at each step and whether the values are appropriate for your data. To that effect, you can find more guidance here.

+

+

In brief, the key modifications made to the DNAseq Best Practices focus on handling splice junctions correctly, which involves specific mapping and pre-processing procedures, as well as some new functionality in the HaplotypeCaller. Here is a detailed overview:

+

+

Caveats

+

Please keep in mind that our DNA-focused Best Practices were developed over several years of thorough experimentation, and are continuously updated as new observations come to light and the analysis methods improve. We have been working with RNAseq for a somewhat shorter time, so there are many aspects that we still need to examine in more detail before we can be fully confident that we are doing the best possible thing.

+

We know that the current recommended pipeline produces both false positives (wrong variant calls) and false negatives (missed variants). While some of those errors are inevitable in any pipeline, others are errors that we can and will address in future versions of the pipeline. A few examples of such errors are given in this article, as well as our ideas for fixing them in the future.

+

We will be improving these recommendations progressively as we go, and we hope that the research community will help us by providing feedback on their experiences applying our recommendations to their data.

+
+

The workflow

+

1. Mapping to the reference

+

The first major difference relative to the DNAseq Best Practices is the mapping step. For DNA-seq, we recommend BWA. For RNA-seq, we evaluated all the major software packages that are specialized in RNAseq alignment, and we found that we were able to achieve the highest sensitivity to both SNPs and, importantly, indels, using STAR aligner. Specifically, we use the STAR 2-pass method which was described in a recent publication (see page 43 of the Supplemental text of the Pär G Engström et al. paper referenced below for full protocol details -- we used the suggested protocol with the default parameters). In brief, in the STAR 2-pass approach, splice junctions detected in a first alignment run are used to guide the final alignment.

+

Here is a walkthrough of the STAR 2-pass alignment steps:

+

1) STAR uses genome index files that must be saved in unique directories. The human genome index was built from the FASTA file hg19.fa as follows:

+
genomeDir=/path/to/hg19
+mkdir $genomeDir
+STAR --runMode genomeGenerate --genomeDir $genomeDir --genomeFastaFiles hg19.fa --runThreadN <n>
+

2) Alignment jobs were executed as follows:

+
runDir=/path/to/1pass
+mkdir $runDir
+cd $runDir
+STAR --genomeDir $genomeDir --readFilesIn mate1.fq mate2.fq --runThreadN <n>
+

3) For the 2-pass STAR, a new index is then created using splice junction information contained in the file SJ.out.tab from the first pass:

+
genomeDir=/path/to/hg19_2pass
+mkdir $genomeDir
+STAR --runMode genomeGenerate --genomeDir $genomeDir --genomeFastaFiles hg19.fa \
+    --sjdbFileChrStartEnd /path/to/1pass/SJ.out.tab --sjdbOverhang 75 --runThreadN <n>
+

4) The resulting index is then used to produce the final alignments as follows:

+
runDir=/path/to/2pass
+mkdir $runDir
+cd $runDir
+STAR --genomeDir $genomeDir --readFilesIn mate1.fq mate2.fq --runThreadN <n>
+

2. Add read groups, sort, mark duplicates, and create index

+

The above step produces a SAM file, which we then put through the usual Picard processing steps: adding read group information, sorting, marking duplicates and indexing.

+
java -jar picard.jar AddOrReplaceReadGroups I=star_output.sam O=rg_added_sorted.bam SO=coordinate RGID=id RGLB=library RGPL=platform RGPU=machine RGSM=sample 
+
+java -jar picard.jar MarkDuplicates I=rg_added_sorted.bam O=dedupped.bam  CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=output.metrics 
+

3. Split'N'Trim and reassign mapping qualities

+

Next, we use a new GATK tool called SplitNCigarReads developed specially for RNAseq, which splits reads into exon segments (getting rid of Ns but maintaining grouping information) and hard-clips any sequences overhanging into the intronic regions.

+ +

In the future we plan to integrate this into the GATK engine so that it will be done automatically where appropriate, but for now it needs to be run as a separate step.

+

At this step we also add one important tweak: we need to reassign mapping qualities, because STAR assigns good alignments a MAPQ of 255 (which technically means “unknown” and is therefore meaningless to GATK). So we use the GATK’s ReassignOneMappingQuality read filter to reassign all good alignments to the default value of 60. This is not ideal, and we hope that in the future RNAseq mappers will emit meaningful quality scores, but in the meantime this is the best we can do. In practice we do this by adding the ReassignOneMappingQuality read filter to the splitter command.

+

Finally, be sure to specify that reads with N cigars should be allowed. This is currently still classified as an "unsafe" option, but this classification will change to reflect the fact that this is now a supported option for RNAseq processing.

+
java -jar GenomeAnalysisTK.jar -T SplitNCigarReads -R ref.fasta -I dedupped.bam -o split.bam -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS
+

4. Indel Realignment (optional)

+

After the splitting step, we resume our regularly scheduled programming... to some extent. We have found that performing realignment around indels can help rescue a few indels that would otherwise be missed, but to be honest the effect is marginal. So while it can’t hurt to do it, we only recommend performing the realignment step if you have compute and time to spare (or if it’s important not to miss any potential indels).

+

5. Base Recalibration

+

We do recommend running base recalibration (BQSR). Even though the effect is also marginal when applied to good quality data, it can absolutely save your butt in cases where the qualities have systematic error modes.

+

Both steps 4 and 5 are run as described for DNAseq (with the same known sites resource files), without any special arguments. Finally, please note that you should NOT run ReduceReads on your RNAseq data. The ReduceReads tool will no longer be available in GATK 3.0.

+

6. Variant calling

+

Finally, we have arrived at the variant calling step! Here, we recommend using HaplotypeCaller because it is performing much better in our hands than UnifiedGenotyper (our tests show that UG was able to call less than 50% of the true positive indels that HC calls). We have added some functionality to the variant calling code which will intelligently take into account the information about intron-exon split regions that is embedded in the BAM file by SplitNCigarReads. In brief, the new code will perform “dangling head merging” operations and avoid using soft-clipped bases (this is a temporary solution) as necessary to minimize false positive and false negative calls. To invoke this new functionality, just add -dontUseSoftClippedBases to your regular HC command line. Note that the -recoverDanglingHeads argument which was previously required is no longer necessary as that behavior is now enabled by default in HaplotypeCaller. Also, we found that we get better results if we set the minimum phred-scaled confidence threshold for calling variants to 20, but you can lower this to increase sensitivity if needed.

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R ref.fasta -I input.bam -dontUseSoftClippedBases -stand_call_conf 20.0 -o output.vcf
+

7. Variant filtering

+

To filter the resulting callset, you will need to apply hard filters, as we do not yet have the RNAseq training/truth resources that would be needed to run variant recalibration (VQSR).

+

We recommend that you filter clusters of at least 3 SNPs that fall within a window of 35 bases, by adding -window 35 -cluster 3 to your command. This filter recommendation is specific for RNA-seq data.

+

As in DNA-seq, we recommend filtering based on Fisher Strand values (FS > 30.0) and Qual By Depth values (QD < 2.0).

+
java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0" -filterName QD -filter "QD < 2.0" -o output.vcf 
+

Please note that we selected these hard filtering values in an attempt to optimize both high sensitivity and specificity together. By applying the hard filters, some real sites will get filtered. This is a tradeoff that each analyst should consider based on his/her own project. If you care more about sensitivity and are willing to tolerate more false positive calls, you can choose not to filter at all (or to use less restrictive thresholds).

+

An example of filtered (SNPs cluster filter) and unfiltered false variant calls:

+ +

An example of true variants that were filtered (false negatives). As explained in text, there is a tradeoff that comes with applying filters:

+ +
+

Known issues

+

There are a few known issues; one is that the allelic ratio is problematic. In many heterozygous sites, even if we can see in the RNAseq data both alleles that are present in the DNA, the ratio between the number of reads with the different alleles is far from 0.5, and thus the HaplotypeCaller (or any caller that expects a diploid genome) will miss that call. A DNA-aware mode of the caller might be able to fix such cases (which may be candidates also for downstream analysis of allele specific expression).

+

Although our new tool (splitNCigarReads) cleans many false positive calls that are caused by splicing inaccuracies by the aligners, we still call some false variants for that same reason, as can be seen in the example below. Some of those errors might be fixed in future versions of the pipeline with more sophisticated filters, with another realignment step in those regions, or by making the caller aware of splice positions.

+ + +

As stated previously, we will continue to improve the tools and process over time. We have plans to improve the splitting/clipping functionalities, to improve the true positive rate while minimizing the false positive rate, and to develop statistical filtering (i.e. variant recalibration) recommendations.

+

We also plan to add functionality to process DNAseq and RNAseq data from the same samples simultaneously, in order to facilitate analyses of post-transcriptional processes. Future extensions to the HaplotypeCaller will provide this functionality, which will require both DNAseq and RNAseq in order to produce the best results. Finally, we are also looking at solutions for measuring differential expression of alleles.

+
+

[1] Pär G Engström et al. “Systematic evaluation of spliced alignment programs for RNA-seq data”. Nature Methods, 2013

+
+

NOTE: Questions about this document that were posted before June 2014 have been moved to this archival thread: http://gatkforums.broadinstitute.org/discussion/4709/questions-about-the-rnaseq-variant-discovery-workflow

\ No newline at end of file diff --git a/doc_archive/methods/Calling_variants_on_cohorts_of_samples_using_the_HaplotypeCaller_in_GVCF_mode.md b/doc_archive/methods/Calling_variants_on_cohorts_of_samples_using_the_HaplotypeCaller_in_GVCF_mode.md new file mode 100644 index 000000000..c8d3438c2 --- /dev/null +++ b/doc_archive/methods/Calling_variants_on_cohorts_of_samples_using_the_HaplotypeCaller_in_GVCF_mode.md @@ -0,0 +1,29 @@ +## Calling variants on cohorts of samples using the HaplotypeCaller in GVCF mode + +http://gatkforums.broadinstitute.org/gatk/discussion/3893/calling-variants-on-cohorts-of-samples-using-the-haplotypecaller-in-gvcf-mode + +

This document describes the new approach to joint variant discovery that is available in GATK versions 3.0 and above. For a more detailed discussion of why it's better to perform joint discovery, see this FAQ article. For more details on how this fits into the overall reads-to-variants analysis workflow, see the Best Practices workflows documentation.

+

Overview

+

This is the workflow recommended in our Best Practices for performing variant discovery analysis on cohorts of samples.

+

+

In a nutshell, we now call variants individually on each sample using the HaplotypeCaller in -ERC GVCF mode, leveraging the previously introduced reference model to produce a comprehensive record of genotype likelihoods and annotations for each site in the genome (or exome), in the form of a gVCF file (genomic VCF).

+ +

In a second step, we then perform a joint genotyping analysis of the gVCFs produced for all samples in a cohort. This allows us to achieve the same accurate genotyping results as traditional joint calling, without the computational nightmare of exponential runtimes, and with the added flexibility of being able to re-run the population-level genotyping analysis at any time as the available cohort grows.

+ +

This is meant to replace the joint discovery workflow that we previously recommended, which involved calling variants jointly on multiple samples, with a much smarter approach that reduces computational burden and solves the "N+1 problem".

+ +
+

Workflow details

+

This is a quick overview of how to apply the workflow in practice. For more details, see the Best Practices workflows documentation.

+

1. Variant calling

+

Run the HaplotypeCaller on each sample's BAM file(s) (if a sample's data is spread over more than one BAM, then pass them all in together) to create single-sample gVCFs, with the option --emitRefConfidence GVCF, and using the .g.vcf extension for the output file.

+

Note that versions older than 3.4 require passing the options --variant_index_type LINEAR --variant_index_parameter 128000 to set the correct index strategy for the output gVCF.
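For example, with GATK 3.4 or later, such a command might look like this (file names are placeholders):

java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R ref.fasta -I sample1.bam \
    --emitRefConfidence GVCF -o sample1.g.vcf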

+

2. Optional data aggregation step

+

If you have more than a few hundred samples, run CombineGVCFs on batches of ~200 gVCFs to hierarchically merge them into a single gVCF. This will make the next step more tractable.
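As an illustrative sketch, merging one batch of per-sample gVCFs might look like this (file names are placeholders):

java -jar GenomeAnalysisTK.jar -T CombineGVCFs -R ref.fasta \
    -V sample1.g.vcf -V sample2.g.vcf -V sample3.g.vcf -o batch1.g.vcf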

+

3. Joint genotyping

+

Take the outputs from step 2 (or step 1 if dealing with fewer samples) and run GenotypeGVCFs on all of them together to create the raw SNP and indel VCFs that are usually emitted by the callers.
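A minimal sketch of this step (file names are placeholders):

java -jar GenomeAnalysisTK.jar -T GenotypeGVCFs -R ref.fasta \
    -V batch1.g.vcf -V batch2.g.vcf -o raw_variants.vcf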

+

4. Variant recalibration

+

Finally, resume the classic GATK Best Practices workflow by running VQSR on these "regular" VCFs according to our usual recommendations.

+

That's it! Fairly simple in practice, but we predict this is going to have a huge impact in how people perform variant discovery in large cohorts. We certainly hope it helps people deal with the challenges posed by ever-growing datasets.

+

As always, we look forward to comments and observations from the research community!

\ No newline at end of file diff --git a/doc_archive/methods/Combining_variants_from_different_files_into_one.md b/doc_archive/methods/Combining_variants_from_different_files_into_one.md new file mode 100644 index 000000000..698afc96a --- /dev/null +++ b/doc_archive/methods/Combining_variants_from_different_files_into_one.md @@ -0,0 +1,74 @@ +## Combining variants from different files into one + +http://gatkforums.broadinstitute.org/gatk/discussion/53/combining-variants-from-different-files-into-one + +

Solutions for combining variant callsets depending on purpose

+

There are three main reasons why you might want to combine variants from different files into one, and the tool to use depends on what you are trying to achieve.

+
1. The most common case is when you have been parallelizing your variant calling analyses, e.g. running HaplotypeCaller per-chromosome, producing separate VCF files (or gVCF files) per-chromosome. For that case, you can use a tool called CatVariants to concatenate the files. There are a few important requirements (e.g. the files should contain all the same samples, and distinct intervals) which you can read about on the tool's documentation page.

2. The second case is when you have been using HaplotypeCaller in -ERC GVCF or -ERC BP_RESOLUTION mode to call variants on a large cohort, producing many gVCF files. We recommend combining the output gVCFs in batches of e.g. 200 before putting them through joint genotyping with GenotypeGVCFs (for performance reasons), which you can do using CombineGVCFs, which is specific for handling gVCF files.

3. The third case is when you want to combine variant calls that were produced from the same samples but using different methods, for comparison. For example, if you're evaluating variant calls produced by different variant callers, different workflows, or the same workflow with different parameters. This produces separate callsets for the same samples, which are then easier to compare if you combine them into a single file. For that purpose, you can use CombineVariants, which is capable of merging VCF records intelligently, treating the same samples as separate or not as desired, and combining annotations as appropriate. This is the case that requires the most preparation and forethought because there are many options that may be used to adapt the behavior of the tool.

There is also one reason you might want to combine variants from different files into one, that we do not recommend following. That is, if you have produced variant calls from various samples separately, and want to combine them for analysis. This is how people used to do variant analysis on large numbers of samples, but we don't recommend proceeding this way because that workflow suffers from serious methodological flaws. Instead, you should follow our recommendations as laid out in the Best Practices documentation.

+
+

Merging records across VCFs with CombineVariants

+

Here we provide some more information and a worked out example to illustrate the third case because it is less straightforward than the other two.

+

A key point to understand is that CombineVariants will include a record at every site present in any of your input VCF files, and will annotate, in the set attribute of the INFO field (see below), which input callsets the record is present in, passes in, or is filtered in. In effect, CombineVariants always produces a union of the input VCFs. Any part of the Venn of the N merged VCFs can then be extracted specifically using JEXL expressions on the set attribute using SelectVariants. If you want to extract just the records in common between two VCFs, you would first CombineVariants the two files into a single VCF, and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out in the detailed example below.

+

Handling PASS/FAIL records at the same site in multiple input files

+

The -filteredRecordsMergeType argument determines how CombineVariants handles sites where a record is present in multiple VCFs, but it is filtered in some and unfiltered in others, as described in the tool documentation page linked above.

+

Understanding the set attribute

+

The set property of the INFO field indicates which call set the variant was found in. It can take on a variety of values indicating the exact nature of the overlap between the call sets. Note that the values are generalized for multi-way combinations, but here we describe only the values for 2 call sets being combined.

+ +

For combinations of three or more call sets, you can see records like NAME1-NAME2 indicating that a variant occurred in both NAME1 and NAME2 but not in all the sets.

+

You specify the NAME of a callset by using the following syntax in your command line: -V:omni 1000G_omni2.5.b37.sites.vcf.

+

Emitting minimal VCF output

+

You can add the -minimalVCF argument to CombineVariants if you want to eliminate unnecessary information from the INFO field and genotypes. In that case, the only fields emitted will be GT:GQ for genotypes and the keySet for INFO.

+

An even more extreme output format is -sites_only (a general engine capability listed in the CommandLineGATK documentation) where the genotypes for all samples are completely stripped away from the output format. Enabling this option results in a significant performance speedup as well.

+

Requiring sites to be present in a minimum number of callsets

+

Sometimes you may want to combine several data sets but only keep sites that are present in at least 2 of them. To do so, simply add the -minN (or --minimumN) argument, followed by an integer, if you want to only output records present in at least N input files. In our example, you would add -minN 2 to the command line.

+

Example: intersecting two VCFs

+

In the following example, we use CombineVariants and SelectVariants to obtain only the sites in common between the OMNI 2.5M and HapMap3 sites in the GSA bundle.

+
# combine the data
+java -Xmx2g -jar dist/GenomeAnalysisTK.jar -T CombineVariants -R bundle/b37/human_g1k_v37.fasta -L 1:1-1,000,000 -V:omni bundle/b37/1000G_omni2.5.b37.sites.vcf -V:hm3 bundle/b37/hapmap_3.3.b37.sites.vcf -o union.vcf
+
+# select the intersection
+java -Xmx2g -jar dist/GenomeAnalysisTK.jar -T SelectVariants -R ~/Desktop/broadLocal/localData/human_g1k_v37.fasta -L 1:1-1,000,000 -V:variant union.vcf -select 'set == "Intersection";' -o intersect.vcf
+

This results in two vcf files, which look like:

+
# contents of union.vcf
+1       990839  SNP1-980702     C       T       .       PASS    AC=150;AF=0.05384;AN=2786;CR=100.0;GentrainScore=0.7267;HW=0.0027632264;set=Intersection
+1       990882  SNP1-980745     C       T       .       PASS    CR=99.79873;GentrainScore=0.7403;HW=0.005225421;set=omni
+1       990984  SNP1-980847     G       A       .       PASS    CR=99.76005;GentrainScore=0.8406;HW=0.26163524;set=omni
+1       992265  SNP1-982128     C       T       .       PASS    CR=100.0;GentrainScore=0.7412;HW=0.0025895447;set=omni
+1       992819  SNP1-982682     G       A       .       id50    CR=99.72961;GentrainScore=0.8505;HW=4.811053E-17;set=FilteredInAll
+1       993987  SNP1-983850     T       C       .       PASS    CR=99.85935;GentrainScore=0.8336;HW=9.959717E-28;set=omni
+1       994391  rs2488991       G       T       .       PASS    AC=1936;AF=0.69341;AN=2792;CR=99.89378;GentrainScore=0.7330;HW=1.1741E-41;set=filterInomni-hm3
+1       996184  SNP1-986047     G       A       .       PASS    CR=99.932205;GentrainScore=0.8216;HW=3.8830226E-6;set=omni
+1       998395  rs7526076       A       G       .       PASS    AC=2234;AF=0.80187;AN=2786;CR=100.0;GentrainScore=0.8758;HW=0.67373306;set=Intersection
+1       999649  SNP1-989512     G       A       .       PASS    CR=99.93262;GentrainScore=0.7965;HW=4.9767335E-4;set=omni
+
+# contents of intersect.vcf
+1       950243  SNP1-940106     A       C       .       PASS    AC=826;AF=0.29993;AN=2754;CR=97.341675;GentrainScore=0.7311;HW=0.15148845;set=Intersection
+1       957640  rs6657048       C       T       .       PASS    AC=127;AF=0.04552;AN=2790;CR=99.86667;GentrainScore=0.6806;HW=2.286109E-4;set=Intersection
+1       959842  rs2710888       C       T       .       PASS    AC=654;AF=0.23559;AN=2776;CR=99.849;GentrainScore=0.8072;HW=0.17526293;set=Intersection
+1       977780  rs2710875       C       T       .       PASS    AC=1989;AF=0.71341;AN=2788;CR=99.89077;GentrainScore=0.7875;HW=2.9912625E-32;set=Intersection
+1       985900  SNP1-975763     C       T       .       PASS    AC=182;AF=0.06528;AN=2788;CR=99.79926;GentrainScore=0.8374;HW=0.017794203;set=Intersection
+1       987200  SNP1-977063     C       T       .       PASS    AC=1956;AF=0.70007;AN=2794;CR=99.45917;GentrainScore=0.7914;HW=1.413E-42;set=Intersection
+1       987670  SNP1-977533     T       G       .       PASS    AC=2485;AF=0.89196;AN=2786;CR=99.51427;GentrainScore=0.7005;HW=0.24214932;set=Intersection
+1       990417  rs2465136       T       C       .       PASS    AC=1113;AF=0.40007;AN=2782;CR=99.7599;GentrainScore=0.8750;HW=8.595538E-5;set=Intersection
+1       990839  SNP1-980702     C       T       .       PASS    AC=150;AF=0.05384;AN=2786;CR=100.0;GentrainScore=0.7267;HW=0.0027632264;set=Intersection
+1       998395  rs7526076       A       G       .       PASS    AC=2234;AF=0.80187;AN=2786;CR=100.0;GentrainScore=0.8758;HW=0.67373306;set=Intersection
\ No newline at end of file diff --git a/doc_archive/methods/Evaluating_the_quality_of_a_variant_callset.md b/doc_archive/methods/Evaluating_the_quality_of_a_variant_callset.md new file mode 100644 index 000000000..550ced427 --- /dev/null +++ b/doc_archive/methods/Evaluating_the_quality_of_a_variant_callset.md @@ -0,0 +1,109 @@ +## Evaluating the quality of a variant callset + +http://gatkforums.broadinstitute.org/gatk/discussion/6308/evaluating-the-quality-of-a-variant-callset + +

Introduction

+

Running through the steps involved in variant discovery (calling variants, joint genotyping and applying filters) produces a variant callset in the form of a VCF file. So what’s next? Technically, that callset is ready to be used in downstream analysis. But before you do that, we recommend running some quality control analyses to evaluate how “good” that callset is.

+

To be frank, distinguishing between a “good” callset and a “bad” callset is a complex problem. If you knew the absolute truth of what variants are present or not in your samples, you probably wouldn’t be here running variant discovery on some high-throughput sequencing data. Your fresh new callset is your attempt to discover that truth. So how do you know how close you got?

+

Methods for variant evaluation

+

There are several methods that you can apply which offer different insights into the probable biological truth, all with their own pros and cons. Possibly the most trusted method is Sanger sequencing of regions surrounding putative variants. However, it is also the least scalable as it would be prohibitively costly and time-consuming to apply to an entire callset. Typically, Sanger sequencing is only applied to validate candidate variants that are judged highly likely. Another popular method is to evaluate concordance against results obtained from a genotyping chip run on the same samples. This is much more scalable, and conveniently also doubles as a quality control method to detect sample swaps. Although it only covers the subset of known variants that the chip was designed for, this method can give you a pretty good indication of both sensitivity (ability to detect true variants) and specificity (not calling variants where there are none). This is something we do systematically for all samples in the Broad’s production pipelines.

+

The third method, presented here, is to evaluate how your variant callset stacks up against another variant callset (typically derived from other samples) that is considered to be a truth set (sometimes referred to as a gold standard -- these terms are very close and often used interchangeably). The general idea is that key properties of your callset (metrics discussed later in the text) should roughly match those of the truth set. This method is not meant to render any judgments about the veracity of individual variant calls; instead, it aims to estimate the overall quality of your callset and detect any red flags that might be indicative of error.

+

Underlying assumptions and truthiness*: a note of caution

+

It should be immediately obvious that there are two important assumptions being made here: 1) that the content of the truth set has been validated somehow and is considered especially trustworthy; and 2) that your samples are expected to have similar genomic content as the population of samples that was used to produce the truth set. These assumptions are not always well-supported, depending on the truth set, your callset, and what they have (or don’t have) in common. You should always keep this in mind when choosing a truth set for your evaluation; it’s a jungle out there. Consider that if anyone can submit variants to a truth set’s database without a well-regulated validation process, and there is no process for removing variants if someone later finds they were wrong (I’m looking at you, dbSNP), you should be extra cautious in interpreting results.

*With apologies to Stephen Colbert.

+

Validation

+

So what constitutes validation? Well, the best validation is done with orthogonal methods, meaning that it is done with technology (wetware, hardware, software, etc.) that is not subject to the same error modes as the sequencing process. Calling variants with two callers that use similar algorithms? Great way to reinforce your biases. It won’t mean anything that both give the same results; they could both be making the same mistakes. On the wetlab side, Sanger and genotyping chips are great validation tools; the technology is pretty different, so they tend to make different mistakes. Therefore it means more if they agree or disagree with calls made from high-throughput sequencing.

+

Matching populations

+

Regarding the population genomics aspect: it’s complicated -- especially if we’re talking about humans (I am). There’s a lot of interesting literature on this topic; for now let’s just summarize by saying that some important variant calling metrics vary depending on ethnicity. So if you are studying a population with a very specific ethnic composition, you should try to find a truth set composed of individuals with a similar ethnic background, and adjust your expectations accordingly for some metrics.

+

Similar principles apply to non-human genomic data, with important variations depending on whether you’re looking at wild or domesticated populations, natural or experimentally manipulated lineages, and so on. Unfortunately we can’t currently provide any detailed guidance on this topic, but hopefully this explanation of the logic and considerations involved will help you formulate a variant evaluation strategy that is appropriate for your organism of interest.

+
+

Variant evaluation metrics

+

So let’s say you’ve got your fresh new callset and you’ve found an appropriate truth set. You’re ready to look at some metrics (but don’t worry yet about how; we’ll get to that soon enough). There are several metrics that we recommend examining in order to evaluate your data. The set described here should be considered a minimum and is by no means exclusive. It is nearly always better to evaluate more metrics if you possess the appropriate data to do so -- and as long as you understand why those additional metrics are meaningful. Please don’t try to use metrics that you don’t understand properly, because misunderstandings lead to confusion; confusion leads to worry; and worry leads to too many desperate posts on the GATK forum.

+

Variant-level concordance and genotype concordance

+

The relationship between variant-level concordance and genotype concordance is illustrated in this figure.

+ +

Number of Indels & SNPs and TiTv Ratio

+

These metrics are widely applicable. The table below summarizes their expected value ranges for Human Germline Data:

| Sequencing Type | # of Variants* | TiTv Ratio |
| --- | --- | --- |
| WGS | ~4.4M | 2.0-2.1 |
| WES | ~41k | 3.0-3.3 |
+

*for a single sample

+ +

Ratio of Insertions to Deletions (Indel Ratio)

+

This metric is generally evaluated after filtering for purposes that are specific to your study, and the expected value range depends on whether you're looking for rare or common variants, as summarized in the table below.

| Filtering for | Indel Ratio |
| --- | --- |
| common | ~1 |
| rare | 0.2-0.5 |
+

A significant deviation from the expected ratios listed in the table above could indicate a bias resulting from artifactual variants.

+
+

Tools for performing variant evaluation

+

VariantEval

+

This is the GATK’s main tool for variant evaluation. It is designed to collect and calculate a variety of callset metrics that are organized in evaluation modules, which are listed in the tool doc. For each evaluation module that is enabled, the tool will produce a table containing the corresponding callset metrics based on the specified inputs (your callset of interest and one or more truth sets). By default, VariantEval will run with a specific subset of the available modules (listed below), but all evaluation modules can be enabled or disabled from the command line. We recommend setting the tool to produce only the metrics that you are interested in, because each active module adds to the computational requirements and overall runtime of the tool.

+

It should be noted that all module calculations only include variants that passed filtering (i.e. FILTER column in your vcf file should read PASS); variants tagged as filtered out will be ignored. It is not possible to modify this behavior. See the example analysis for more details on how to use this tool and interpret its output.
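As a rough sketch of a typical invocation (the truth set, dbSNP resource and module selection are placeholders to adapt to your own evaluation):

java -jar GenomeAnalysisTK.jar -T VariantEval -R ref.fasta \
    -eval my_callset.vcf -comp:truth truth_callset.vcf -D dbsnp.vcf \
    -noEV -EV CompOverlap -EV CountVariants -EV TiTvVariantEvaluator -EV IndelSummary \
    -o my_callset.eval.grp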

+

GenotypeConcordance

+

This tool calculates -- you’ve guessed it -- the genotype concordance between callsets. In earlier versions of GATK, GenotypeConcordance was itself a module within VariantEval. It was converted into a standalone tool to enable more complex genotype concordance calculations.

+

Picard tools

+

The Picard toolkit includes two tools that perform similar functions to VariantEval and GenotypeConcordance, respectively called CollectVariantCallingMetrics and GenotypeConcordance. Both are relatively lightweight in comparison to their GATK equivalents; their functionalities are more limited, but they do run quite a bit faster. See the example analysis of CollectVariantCallingMetrics for details on its use and data interpretation. Note that in the coming months, the Picard tools are going to be integrated into the next major version of GATK, so at that occasion we plan to consolidate these two pairs of homologous tools to eliminate redundancy.
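A minimal sketch of the Picard equivalents (sample names and resource files are placeholders):

java -jar picard.jar CollectVariantCallingMetrics INPUT=my_callset.vcf DBSNP=dbsnp.vcf OUTPUT=my_callset_metrics

java -jar picard.jar GenotypeConcordance CALL_VCF=my_callset.vcf CALL_SAMPLE=NA12878 \
    TRUTH_VCF=truth_callset.vcf TRUTH_SAMPLE=NA12878 OUTPUT=concordance_metrics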

+

Which tool should I use?

+

We recommend Picard's version of each tool for most cases. The GenotypeConcordance tools provide mostly the same information, but Picard's version is preferred by Broadies. Both VariantEval and CollectVariantCallingMetrics produce similar metrics, however the latter runs faster and scales better for larger cohorts. By default, CollectVariantCallingMetrics stratifies by sample, allowing you to see the value of relevant statistics as they pertain to specific samples in your cohort. It includes all metrics discussed here, as well as a few more. On the other hand, VariantEval provides many more metrics beyond the minimum described here for analysis. It should be noted that none of these tools use phasing to determine metrics.

+

So when should I use CollectVariantCallingMetrics?

+ +

When should I use VariantEval?

+ \ No newline at end of file diff --git a/doc_archive/methods/Genotype_Refinement_workflow.md b/doc_archive/methods/Genotype_Refinement_workflow.md new file mode 100644 index 000000000..8a0cde208 --- /dev/null +++ b/doc_archive/methods/Genotype_Refinement_workflow.md @@ -0,0 +1,76 @@ +## Genotype Refinement workflow + +http://gatkforums.broadinstitute.org/gatk/discussion/4723/genotype-refinement-workflow + +

Overview

+

This document describes the purpose and general principles of the Genotype Refinement workflow. For the mathematical details of the methods involved, please see the Genotype Refinement math documentation. For step-by-step instructions on how to apply this workflow to your data, please see the Genotype Refinement tutorial.

+
+

1. Introduction

+

The core GATK Best Practices workflow has historically focused on variant discovery --that is, the existence of genomic variants in one or more samples in a cohort-- and consistently delivers high quality results when applied appropriately. However, we know that the quality of the individual genotype calls coming out of the variant callers can vary widely based on the quality of the BAM data for each sample. The goal of the Genotype Refinement workflow is to use additional data to improve the accuracy of genotype calls and to filter genotype calls that are not reliable enough for downstream analysis. In this sense it serves as an optional extension of the variant calling workflow, intended for researchers whose work requires high-quality identification of individual genotypes.

+

A few commonly asked questions are:

+

What studies can benefit from the Genotype Refinement workflow?

+

While every study can benefit from increased data accuracy, this workflow is especially useful for analyses that are concerned with how many copies of each variant an individual has (e.g. in the case of loss of function) or with the transmission (or de novo origin) of a variant in a family.

+

What additional data do I need to run the Genotype Refinement workflow?

+

If a “gold standard” dataset for SNPs is available, that can be used as a very powerful set of priors on the genotype likelihoods in your data. For analyses involving families, a pedigree file describing the relatedness of the trios in your study will provide another source of supplemental information. If neither of these applies to your data, the samples in the dataset itself can provide some degree of genotype refinement (see section 5 below for details).

+

Is the Genotype Refinement workflow going to change my data? Can I still use my old analysis pipeline?

+

After running the Genotype Refinement workflow, several new annotations will be added to the INFO and FORMAT fields of your variants (see below), GQ fields will be updated, and genotype calls may be modified. However, the Phred-scaled genotype likelihoods (PLs) which indicate the original genotype call (the genotype candidate with PL=0) will remain untouched. Any analysis that made use of the PLs will produce the same results as before.

+
+

2. The Genotype Refinement workflow

+

Overview

+ +

Input

+

Begin with recalibrated variants from VQSR at the end of the best practices pipeline. The filters applied by VQSR will be carried through the Genotype Refinement workflow.

+

Step 1: Derive posterior probabilities of genotypes

+

Tool used: CalculateGenotypePosteriors

+

Using the Phred-scaled genotype likelihoods (PLs) for each sample, prior probabilities for a sample taking on a HomRef, Het, or HomVar genotype are applied to derive the posterior probabilities of the sample taking on each of those genotypes. A sample’s PLs were calculated by HaplotypeCaller using only the reads for that sample. By introducing additional data like the allele counts from the 1000 Genomes project and the PLs for other individuals in the sample’s pedigree trio, those estimates of genotype likelihood can be improved based on what is known about the variation of other individuals.

+

SNP calls from the 1000 Genomes project capture the vast majority of variation across most human populations and can provide very strong priors in many cases. At sites where most of the 1000 Genomes samples are homozygous variant with respect to the reference genome, the probability that a sample being analyzed is also homozygous variant is very high.

+

For a sample for which both parent genotypes are available, the child’s genotype can be supported or invalidated by the parents’ genotypes based on Mendel’s laws of allele transmission. Even the confidence of the parents’ genotypes can be recalibrated, such as in cases where the genotypes output by HaplotypeCaller are apparent Mendelian violations.
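A hedged sketch of this step with GATK 3 (the supporting callset and pedigree file are placeholders):

java -jar GenomeAnalysisTK.jar -T CalculateGenotypePosteriors -R ref.fasta \
    -V recalibrated_variants.vcf --supporting 1000G_phase3_snps.vcf -ped trio.ped \
    -o variants_with_posteriors.vcf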

+

Step 2: Filter low quality genotypes

+

Tool used: VariantFiltration

+

After the posterior probabilities are calculated for each sample at each variant site, genotypes with GQ < 20 based on the posteriors are filtered out. GQ20 is widely accepted as a good threshold for genotype accuracy, indicating that there is a 99% chance that the genotype in question is correct. Tagging those low quality genotypes indicates to researchers that these genotypes may not be suitable for downstream analysis. However, as with the VQSR, a filter tag is applied, but the data is not removed from the VCF.
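For example, the genotype-level filter can be applied like this (file names are placeholders):

java -jar GenomeAnalysisTK.jar -T VariantFiltration -R ref.fasta \
    -V variants_with_posteriors.vcf -G_filter "GQ < 20.0" -G_filterName lowGQ \
    -o variants_gfiltered.vcf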

+

Step 3: Annotate possible de novo mutations

+

Tool used: VariantAnnotator

+

Using the posterior genotype probabilities, possible de novo mutations are tagged. Low confidence de novos have child GQ >= 10 and AC < 4 or AF < 0.1%, whichever is more stringent for the number of samples in the dataset. High confidence de novo sites have all trio sample GQs >= 20 with the same AC/AF criterion.
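A minimal sketch of the annotation step (the pedigree file is a placeholder):

java -jar GenomeAnalysisTK.jar -T VariantAnnotator -R ref.fasta \
    -V variants_gfiltered.vcf -A PossibleDeNovo -ped trio.ped \
    -o variants_denovo_annotated.vcf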

+

Step 4: Functional annotation of possible biological effects

+

Tool options: SnpEff or Oncotator (both are non-GATK tools)

+

Especially in the case of de novo mutation detection, analysis can benefit from the functional annotation of variants to restrict variants to exons and surrounding regulatory regions. The GATK currently does not feature integration with any functional annotation tool, but SnpEff and Oncotator are useful utilities that can work with the GATK's VCF output.

+
+

3. Output annotations

+

The Genotype Refinement Pipeline adds several new info- and format-level annotations to each variant. GQ fields will be updated, and genotypes calculated to be highly likely to be incorrect will be changed. The Phred-scaled genotype likelihoods (PLs) carry through the pipeline without being changed. In this way, PLs can be used to derive the original genotypes in cases where sample genotypes were changed.

+

Population Priors

+

New INFO field annotation PG is a vector of the Phred-scaled prior probabilities of a sample at that site being HomRef, Het, and HomVar. These priors are based on the input samples themselves along with data from the supporting samples if the variant in question overlaps another in the supporting dataset.

+

Phred-Scaled Posterior Probability

+

New FORMAT field annotation PP is the Phred-scaled posterior probability of the sample taking on each genotype for the given variant context alleles. The PPs represent a better calibrated estimate of genotype probabilities than the PLs and are recommended for use in further analyses instead of the PLs.

+

Genotype Quality

+

Current FORMAT field annotation GQ is updated based on the PPs. The calculation is the same as for GQ based on PLs.

+

Joint Trio Likelihood

+

New FORMAT field annotation JL is the Phred-scaled joint likelihood of the posterior genotypes for the trio being incorrect. This calculation is based on the PLs produced by HaplotypeCaller (before application of priors), but the genotypes used come from the posteriors. The goal of this annotation is to be used in combination with JP to evaluate the improvement in the overall confidence in the trio’s genotypes after applying CalculateGenotypePosteriors. The calculation of the joint likelihood is given as:

+

$$ -10\log_{10}\left( 1 - GL_{mother}[\text{Posterior mother GT}] \times GL_{father}[\text{Posterior father GT}] \times GL_{child}[\text{Posterior child GT}] \right) $$

+

where the GLs are the genotype likelihoods in [0, 1] probability space.

+

Joint Trio Posterior

+

New FORMAT field annotation JP is the Phred-scaled posterior probability of the output posterior genotypes for the three samples being incorrect. The calculation of the joint posterior is given as:

+

$$ -10\log \left( 1 - GP_{mother}[\text{Posterior mother GT}] \times GP_{father}[\text{Posterior father GT}] \times GP_{child}[\text{Posterior child GT}] \right) $$

+

where the GPs are the genotype posteriors in [0, 1] probability space.

+

Low Genotype Quality

+

New FORMAT field filter lowGQ indicates samples with posterior GQ less than 20. Filtered samples tagged with lowGQ are not recommended for use in downstream analyses.

+

High and Low Confidence De Novo

+

New INFO field annotation for sites at which at least one family has a possible de novo mutation. Following the annotation tag is a list of the children with de novo mutations. High and low confidence are output separately.

+
+

4. Example

+

Before:

+
1       1226231 rs13306638      G       A       167563.16       PASS    AC=2;AF=0.333;AN=6;…        GT:AD:DP:GQ:PL  0/0:11,0:11:0:0,0,249   0/0:10,0:10:24:0,24,360 1/1:0,18:18:60:889,60,0
+

After:

+
1       1226231 rs13306638      G       A       167563.16       PASS    AC=3;AF=0.500;AN=6;…PG=0,8,22;…    GT:AD:DP:GQ:JL:JP:PL:PP 0/1:11,0:11:49:2:24:0,0,249:49,0,287    0/0:10,0:10:32:2:24:0,24,360:0,32,439   1/1:0,18:18:43:2:24:889,60,0:867,43,0
+

The original call for the child (first sample) was HomRef with GQ0. However, given that, with high confidence, one parent is HomRef and one is HomVar, we expect the child to be heterozygous at this site. After family priors are applied, the child’s genotype is corrected and its GQ is increased from 0 to 49. Based on the allele frequency from 1000 Genomes for this site, the somewhat weaker population priors favor a HomRef call (PG=0,8,22). The combined effect of family and population priors still favors a Het call for the child.

+

The joint likelihood for this trio at this site is 2, indicating that the genotype for one of the samples may have been changed. Specifically, a low JL indicates that the posterior genotype for at least one of the samples was not the most likely genotype as predicted by the PLs. The joint posterior value for the trio is 24, which indicates that the GQ values based on the posteriors for all of the samples are at least 24. (See above for a more complete description of JL and JP.)

+
+

5. More information about priors

+

The Genotype Refinement Pipeline uses Bayes’s Rule to combine independent data with the genotype likelihoods derived from HaplotypeCaller, producing more accurate and confident genotype posterior probabilities. Different sites will have different combinations of priors applied based on the overlap of each site with external, supporting SNP calls and on the availability of genotype calls for the samples in each trio.

+

Input-derived Population Priors

+

If the input VCF contains at least 10 samples, then population priors will be calculated based on the discovered allele count for every called variant.

+

Supporting Population Priors

+

Priors derived from supporting SNP calls can only be applied at sites where the supporting calls overlap with called variants in the input VCF. The values of these priors vary based on the called reference and alternate allele counts in the supporting VCF. Higher allele counts (for ref or alt) yield stronger priors.

+

Family Priors

+

The strongest family priors occur at sites where the called trio genotype configuration is a Mendelian violation. In such a case, each Mendelian violation configuration is penalized by a de novo mutation probability (currently 10^-6). Confidence also propagates through a trio. For example, two GQ60 HomRef parents can substantially boost a low-GQ HomRef child, and a GQ60 HomRef child and parent can improve the GQ of the second parent. Application of family priors requires the child to be called at the site in question. If one parent has a no-call genotype, priors can still be applied, but the potential for confidence improvement is not as great as in the 3-sample case.

+

Caveats

+

Right now family priors can only be applied to biallelic variants and population priors can only be applied to SNPs. Family priors only work for trios.

\ No newline at end of file diff --git a/doc_archive/methods/Genotype_Refinement_workflow:_mathematical_details.md b/doc_archive/methods/Genotype_Refinement_workflow:_mathematical_details.md new file mode 100644 index 000000000..aaaf87611 --- /dev/null +++ b/doc_archive/methods/Genotype_Refinement_workflow:_mathematical_details.md @@ -0,0 +1,30 @@ +## Genotype Refinement workflow: mathematical details + +http://gatkforums.broadinstitute.org/gatk/discussion/4726/genotype-refinement-workflow-mathematical-details + +

Overview

+

This document describes the mathematical details of the methods involved in the Genotype Refinement workflow. For an explanation of the purpose and general principles involved in this workflow, please see the main Genotype Refinement workflow article. For step-by-step instructions on how to apply this workflow to your data, please see the Genotype Refinement tutorial.

+
+

1. Review of Bayes’s Rule

+

HaplotypeCaller outputs the likelihoods of observing the read data given that the genotype is actually HomRef, Het, and HomVar. To convert these quantities to the probability of the genotype given the read data, we can use Bayes’s Rule. Bayes’s Rule dictates that the probability of a parameter given observed data is equal to the likelihood of the observations given the parameter multiplied by the prior probability that the parameter takes on the value of interest, normalized by the prior times likelihood for all parameter values:

+

$$ P(\theta|Obs) = \frac{P(Obs|\theta)P(\theta)}{\sum_{\theta} P(Obs|\theta)P(\theta)} $$

+

In the best practices pipeline, we interpret the genotype likelihoods as probabilities by implicitly converting the genotype likelihoods to genotype probabilities using non-informative or flat priors, for which each genotype has the same prior probability. However, in the Genotype Refinement Pipeline we use independent data such as the genotypes of the other samples in the dataset, the genotypes in a “gold standard” dataset, or the genotypes of the other samples in a family to construct more informative priors and derive better posterior probability estimates.
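
+

As a toy illustration of that difference, here is a minimal Python sketch (not GATK code) of applying Bayes' Rule to one sample's genotype likelihoods with a flat prior versus an informative prior; all the numbers are invented for the example.

```python
def posteriors(likelihoods, priors):
    """Apply Bayes' Rule: posterior is proportional to likelihood * prior, normalized over genotypes."""
    unnorm = [l * p for l, p in zip(likelihoods, priors)]
    total = sum(unnorm)
    return [u / total for u in unnorm]

# Genotype order: HomRef, Het, HomVar. Made-up likelihoods for a marginal het call.
likelihoods = [0.40, 0.55, 0.05]

flat        = posteriors(likelihoods, [1/3, 1/3, 1/3])     # what the callers do implicitly
informative = posteriors(likelihoods, [0.70, 0.25, 0.05])  # e.g. derived from other samples

print([round(p, 3) for p in flat])         # ranking unchanged from the likelihoods
print([round(p, 3) for p in informative])  # the HomRef/Het ranking can flip once priors weigh in
```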

+
+

2. Calculation of Population Priors

+

Given a set of samples in addition to the sample of interest (ideally non-related, but from the same ethnic population), we can derive the prior probability of the genotype of the sample of interest by modeling the sample’s alleles as two independent draws from a pool consisting of the set of all the supplemental samples’ alleles. (This follows rather naturally from the Hardy-Weinberg assumptions.) Specifically, this prior probability will take the form of a multinomial Dirichlet distribution parameterized by the allele counts of each allele in the supplemental population. In the biallelic case the priors can be calculated as follows:

+

$$ P(GT = HomRef) = \dbinom{2}{0} \ln \frac{\Gamma(nSamples)\Gamma(RefCount + 2)}{\Gamma(nSamples + 2)\Gamma(RefCount)} $$

+

$$ P(GT = Het) = \dbinom{2}{1} \ln \frac{\Gamma(nSamples)\Gamma(RefCount + 1)\Gamma(AltCount + 1)}{\Gamma(nSamples + 2)\Gamma(RefCount)\Gamma(AltCount)} $$

+

$$ P(GT = HomVar) = \dbinom{2}{2} \ln \frac{\Gamma(nSamples)\Gamma(AltCount + 2)}{\Gamma(nSamples + 2)\Gamma(AltCount)} $$

+

where Γ is the Gamma function, an extension of the factorial function.

+

The prior genotype probabilities based on this distribution scale intuitively with the number of samples. For example, a set of 10 samples, 9 of which are HomRef, yields a prior probability of about 90% that another sample is HomRef, whereas a set of 50 samples, 49 of which are HomRef, yields a probability of about 97% that another sample is HomRef.
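
+

The following is a minimal Python sketch of the underlying Dirichlet-multinomial calculation (illustration only, not GATK code), computed via log-Gamma for numerical stability; it assumes that nSamples in the formulas above denotes the total number of alleles (chromosomes) in the supplemental pool, and it reproduces the roughly 90% HomRef prior described for the 10-sample example.

```python
from math import comb, lgamma, exp

def population_priors(ref_count, alt_count):
    """Dirichlet-multinomial prior over (HomRef, Het, HomVar) for two allele draws
    from a supplemental pool with the given reference/alternate allele counts.
    Assumes both counts are positive."""
    total = ref_count + alt_count
    def prob(n_ref_drawn, n_alt_drawn):
        log_p = (lgamma(total) - lgamma(total + 2)
                 + lgamma(ref_count + n_ref_drawn) - lgamma(ref_count)
                 + lgamma(alt_count + n_alt_drawn) - lgamma(alt_count))
        return comb(2, n_alt_drawn) * exp(log_p)
    return [prob(2, 0), prob(1, 1), prob(0, 2)]

# 10 supplemental samples, 9 HomRef and 1 Het -> pool of 19 ref alleles and 1 alt allele
print([round(p, 3) for p in population_priors(19, 1)])   # HomRef prior is about 0.90
```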

+
+

3. Calculation of Family Priors

+

Given a genotype configuration for a given mother, father, and child trio, we set the prior probability of that genotype configuration as follows:

+

$$ P(G_M,G_F,G_C) = P(\vec{G}) = \begin{cases} 1-10\mu-2\mu^2 & \text{no MV} \\ \mu & \text{1 MV} \\ \mu^2 & \text{2 MVs} \end{cases} $$

+

where the 10 configurations with a single Mendelian violation are penalized by the de novo mutation probability μ and the two configurations with two Mendelian violations by μ^2. The remaining configurations are considered valid and are assigned the remaining probability to sum to one.
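
+

The counts of 10 and 2 can be checked by enumerating all 27 trio genotype configurations. The short Python sketch below does this (illustration only, not GATK code), counting one violation for each child allele that could not have been transmitted by a parent; the way the remaining probability mass is shared evenly among the valid configurations at the end is an assumption made for the example.

```python
from itertools import product

def mendelian_violations(mother, father, child):
    """Minimum number of child alleles that cannot have been transmitted by the parents.
    Genotypes are alt-allele counts: 0=HomRef, 1=Het, 2=HomVar (biallelic site)."""
    transmittable = {0: {'R'}, 1: {'R', 'A'}, 2: {'A'}}
    child_alleles = ['R', 'A'] if child == 1 else (['R', 'R'] if child == 0 else ['A', 'A'])
    best = 2
    # try both ways of assigning the child's two alleles to mother and father
    for a_from_mom, a_from_dad in (child_alleles, child_alleles[::-1]):
        mv = (a_from_mom not in transmittable[mother]) + (a_from_dad not in transmittable[father])
        best = min(best, mv)
    return best

mu = 1e-6
counts = {0: 0, 1: 0, 2: 0}
for gm, gf, gc in product(range(3), repeat=3):
    counts[mendelian_violations(gm, gf, gc)] += 1
print(counts)  # {0: 15, 1: 10, 2: 2} -- 10 single-MV and 2 double-MV configurations

# One possible prior: valid configurations share the remaining probability mass equally.
def trio_prior(gm, gf, gc):
    mv = mendelian_violations(gm, gf, gc)
    return {0: (1 - 10 * mu - 2 * mu ** 2) / counts[0], 1: mu, 2: mu ** 2}[mv]

print(trio_prior(0, 0, 1))  # a single-MV configuration gets prior mu
```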

+

This prior is applied to the joint genotype combination of the three samples in the trio. To find the posterior for any single sample, we marginalize over the remaining two samples as shown in the example below to find the posterior probability of the child having a HomRef genotype:

+

$$ P(G_C = HomRef | \vec{D}) = \frac{L_C(G_C = HomRef) \sum_{G_F,G_M} L_F(G_F)L_M(G_M)P(\vec{G})}{\sum_{\vec{H}}P(\vec{D}|\vec{H})P(\vec{H})} $$

+

This quantity P(G_C|D) is calculated for each genotype, then the resulting vector is Phred-scaled and output as the Phred-scaled posterior probabilities (PPs).

+
+

4. Order of the workflow

+

Family priors are calculated and applied before population priors. The opposite ordering results in overly strong population priors because they are applied to the child and parents and then compounded when the trio likelihoods are multiplied together.

\ No newline at end of file diff --git a/doc_archive/methods/HC_overview:_How_the_HaplotypeCaller_works.md b/doc_archive/methods/HC_overview:_How_the_HaplotypeCaller_works.md new file mode 100644 index 000000000..c64441da1 --- /dev/null +++ b/doc_archive/methods/HC_overview:_How_the_HaplotypeCaller_works.md @@ -0,0 +1,39 @@ +## HC overview: How the HaplotypeCaller works + +http://gatkforums.broadinstitute.org/gatk/discussion/4148/hc-overview-how-the-haplotypecaller-works + +

This document describes the methods involved in variant calling as performed by the HaplotypeCaller. Please note that we are still working on producing supporting figures to help explain the sometimes complex operations involved.

+

Overview

+

The core operations performed by HaplotypeCaller can be grouped into these major steps:

+ +

1. Define active regions. The program determines which regions of the genome it needs to operate on, based on the presence of significant evidence for variation.

+

2. Determine haplotypes by re-assembly of the active region. For each ActiveRegion, the program builds a De Bruijn-like graph to reassemble the ActiveRegion and identifies what are the possible haplotypes present in the data. The program then realigns each haplotype against the reference haplotype using the Smith-Waterman algorithm in order to identify potentially variant sites.

+

3. Determine likelihoods of the haplotypes given the read data. For each ActiveRegion, the program performs a pairwise alignment of each read against each haplotype using the PairHMM algorithm. This produces a matrix of likelihoods of haplotypes given the read data. These likelihoods are then marginalized to obtain the likelihoods of alleles per read for each potentially variant site.

+

4. Assign sample genotypes. For each potentially variant site, the program applies Bayes’ rule, using the likelihoods of alleles given the read data to calculate the posterior likelihoods of each genotype per sample given the read data observed for that sample. The most likely genotype is then assigned to the sample.

+
+

1. Define active regions

+

In this first step, the program traverses the sequencing data to identify regions of the genome in which the samples being analyzed show substantial evidence of variation relative to the reference. The resulting areas are defined as “active regions”, and will be passed on to the next step. Areas that do not show any variation beyond the expected levels of background noise will be skipped in the next step. This aims to accelerate the analysis by not wasting time performing reassembly on regions that are identical to the reference anyway.

+

To define these active regions, the program operates in three phases. First, it computes an activity score for each individual genome position, yielding the raw activity profile, which is a wave function of activity per position. Then, it applies a smoothing algorithm to the raw profile, which is essentially a sort of averaging process, to yield the actual activity profile. Finally, it identifies local maxima where the activity profile curve rises above the preset activity threshold, and defines appropriate intervals to encompass the active profile within the preset size constraints. For more details on how the activity profile is computed and processed, as well as what options are available to modify the active region parameters, please see this method article.

+

Note that the process for determining active region intervals is modified slightly when HaplotypeCaller is run in one of the special modes, e.g. the reference confidence mode (-ERC GVCF or -ERC BP_RESOLUTION), Genotype Given Alleles (-gt_mode GENOTYPE_GIVEN_ALLELES) or when active regions are triggered using advanced arguments such as -allelesTrigger, --forceActive or --activeRegionIn. This is covered in the method article referenced above.

+

Once this process is complete, the program applies a few post-processing steps to finalize the active regions (see detailed doc above). The final output of this process is a list of intervals corresponding to the active regions which will be processed in the next step.

+
+

2. Determine haplotypes by re-assembly of the active region.

+

The goal of this step is to reconstruct the possible sequences of the real physical segments of DNA present in the original sample organism. To do this, the program goes through each active region and uses the input reads that mapped to that region to construct complete sequences covering its entire length, which are called haplotypes. This process will typically generate several different possible haplotypes for each active region due to:

+ +

In order to generate a list of possible haplotypes, the program first builds an assembly graph for the active region using the reference sequence as a template. Then, it takes each read in turn and attempts to match it to a segment of the graph. Whenever portions of a read do not match the local graph, the program adds new nodes to the graph to account for the mismatches. After this process has been repeated with many reads, it typically yields a complex graph with many possible paths. However, because the program keeps track of how many reads support each path segment, we can select only the most likely (well-supported) paths. These likely paths are then used to build the haplotype sequences which will be used for scoring and genotyping in the next step.

+

The assembly and haplotype determination procedure is described in full detail in this method article.

+

Once the haplotypes have been determined, each one is realigned against the original reference sequence in order to identify potentially variant sites. This produces the set of sites that will be processed in the next step. A subset of these sites will eventually be emitted as variant calls to the output VCF.

+
+

3. Evaluating the evidence for haplotypes and variant alleles

+

Now that we have all these candidate haplotypes, we need to evaluate how much evidence there is in the data to support each one of them. So the program takes each individual read and aligns it against each haplotype in turn (including the reference haplotype) using the PairHMM algorithm, which takes into account the information we have about the quality of the data (i.e. the base quality scores and indel quality scores). This outputs a score for each read-haplotype pairing, expressing the likelihood of observing that read given that haplotype.

+

Those scores are then used to calculate how much evidence there is for individual alleles at the candidate sites that were identified in the previous step. This process is called marginalization over alleles and produces the actual numbers that will finally be used to assign a genotype to the sample in the next step.

+

For further details on the pairHMM output and the marginalization process, see this document.

+
+

4. Assigning per-sample genotypes

+

The previous step produced a table of per-read allele likelihoods for each candidate variant site under consideration. Now, all that remains to do is to evaluate those likelihoods in aggregate to determine what is the most likely genotype of the sample at each site. This is done by applying Bayes' theorem to calculate the likelihoods of each possible genotype, and selecting the most likely. This produces a genotype call as well as the calculation of various metrics that will be annotated in the output VCF if a variant call is emitted.

+

For further details on the genotyping calculations, see this document.

+

This concludes the overview of how HaplotypeCaller works.

\ No newline at end of file diff --git a/doc_archive/methods/HC_step_1:_Defining_ActiveRegions_by_measuring_data_entropy.md b/doc_archive/methods/HC_step_1:_Defining_ActiveRegions_by_measuring_data_entropy.md new file mode 100644 index 000000000..4ca6274fa --- /dev/null +++ b/doc_archive/methods/HC_step_1:_Defining_ActiveRegions_by_measuring_data_entropy.md @@ -0,0 +1,54 @@ +## HC step 1: Defining ActiveRegions by measuring data entropy + +http://gatkforums.broadinstitute.org/gatk/discussion/4147/hc-step-1-defining-activeregions-by-measuring-data-entropy + +

This document describes the procedure used by HaplotypeCaller to define ActiveRegions on which to operate as a prelude to variant calling. For more context information on how this fits into the overall HaplotypeCaller method, please see the more general HaplotypeCaller documentation.

+

Summary

+

To define active regions, the HaplotypeCaller operates in three phases. First, it computes an activity score for each individual genome position, yielding the raw activity profile, which is a wave function of activity per position. Then, it applies a smoothing algorithm to the raw profile, which is essentially a sort of averaging process, to yield the actual activity profile. Finally, it identifies local maxima where the activity profile curve rises above the preset activity threshold, and defines appropriate intervals to encompass the active profile within the preset size constraints.

+
+

1. Calculating the raw activity profile

+

Active regions are determined by calculating a profile function that characterizes “interesting” regions likely to contain variants. The raw profile is first calculated locus by locus.

+

In the normal case (no special mode is enabled) the per-position score is the probability that the position contains a variant as calculated using the reference-confidence model applied to the original alignment.

+

If using the mode for genotyping given alleles (GGA) or the advanced-level flag -useAlleleTrigger, and the site is overlapped by an allele in the VCF file provided through the -alleles argument, the score is set to 1. If the position is not covered by a provided allele, the score is set to 0.

+

This operation gives us a single raw value for each position on the genome (or within the analysis intervals requested using the -L argument).

+
+

2. Smoothing the activity profile

+

The final profile is calculated by smoothing the initial raw profile in three steps. The first two steps consist of spreading individual position raw profile values to contiguous bases. As a result, each position will have more than one raw profile value; these are added up in the third and last step to obtain a final, unique, smoothed value per position.

+
+

1. Unless one of the special modes is enabled (GGA or allele triggering), the position profile value will be copied over to adjacent regions if enough high quality soft-clipped bases immediately precede or follow that position in the original alignment. At time of writing, high-quality soft-clipped bases are those with a quality score of Q29 or more. We consider that there are enough such soft-clips when the average number of high quality bases per soft-clip is 7 or more. In this case the site profile value is copied to all bases within a radius of that position as large as the average soft-clip length, without exceeding a maximum of 50bp.

+

2. Each profile value is then divided and spread out using a Gaussian kernel covering up to a 50bp radius centered at its current position, with a standard deviation, or sigma, set using the -bandPassSigma argument (current default is 17 bp). The larger the sigma, the broader the spread will be (see the sketch after this list).

+

3. For each position, the final smoothed value is calculated as the sum of all its profile values after steps 1 and 2.
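
+

Below is a minimal Python sketch (illustration only, not GATK code) of the Gaussian spreading and summation described in steps 2 and 3 above; the soft-clip expansion of step 1 is omitted and the function name is invented.

```python
import math

def smooth_activity_profile(raw, sigma=17.0, radius=50):
    """Spread each raw per-position activity value with a Gaussian kernel (up to
    `radius` bp away) and sum the contributions at each position."""
    # kernel weights for offsets -radius..radius, normalized so each raw value is divided up
    weights = [math.exp(-(d * d) / (2 * sigma * sigma)) for d in range(-radius, radius + 1)]
    total = sum(weights)
    weights = [w / total for w in weights]

    smoothed = [0.0] * len(raw)
    for pos, value in enumerate(raw):
        if value == 0.0:
            continue
        for d, w in zip(range(-radius, radius + 1), weights):
            if 0 <= pos + d < len(raw):
                smoothed[pos + d] += value * w
    return smoothed

# Toy raw profile: a single "interesting" position in a 200 bp window
raw = [0.0] * 200
raw[100] = 1.0
profile = smooth_activity_profile(raw)
print(round(profile[100], 4), round(profile[110], 4))  # activity spreads to neighbouring bases
```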
+

3. Setting the ActiveRegion thresholds and intervals

+

The resulting profile line is cut into regions where it crosses the non-active to active threshold (currently set to 0.002). Then we make some adjustments to these boundaries so that those regions that are to be considered active, with a profile running over that threshold, fall within the minimum (fixed to 50bp) and maximum region size (customizable using -activeRegionMaxSize).

+ +

Of the resulting regions, those with a profile that runs over this threshold are considered active regions and progress to variant discovery and/or calling, whereas regions whose profile runs under the threshold are considered inactive regions and are discarded, except if we are running HC in reference confidence mode.
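
+

A simplified Python sketch of that thresholding step is shown below (illustration only, not GATK code); the real boundary adjustments are more involved, and the max_size default used here is invented for the example.

```python
def active_regions(profile, threshold=0.002, min_size=50, max_size=300):
    """Cut the smoothed profile where it crosses the threshold, then pad short
    intervals to min_size and split long ones so none exceeds max_size (simplified)."""
    regions, start = [], None
    for pos, value in enumerate(profile + [0.0]):           # sentinel to close the last run
        if value > threshold and start is None:
            start = pos
        elif value <= threshold and start is not None:
            regions.append((start, pos - 1))
            start = None

    adjusted = []
    for start, end in regions:
        while end - start + 1 > max_size:                    # split oversized intervals
            adjusted.append((start, start + max_size - 1))
            start += max_size
        pad = max(0, min_size - (end - start + 1)) // 2      # pad undersized intervals
        adjusted.append((max(0, start - pad), end + pad))
    return adjusted

print(active_regions([0.0] * 80 + [0.01] * 10 + [0.0] * 80))   # one ~50 bp active interval
```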

+

There is a final post-processing step to clean up and trim the ActiveRegion:

+ \ No newline at end of file diff --git a/doc_archive/methods/HC_step_2:_Local_re-assembly_and_haplotype_determination.md b/doc_archive/methods/HC_step_2:_Local_re-assembly_and_haplotype_determination.md new file mode 100644 index 000000000..5948c8074 --- /dev/null +++ b/doc_archive/methods/HC_step_2:_Local_re-assembly_and_haplotype_determination.md @@ -0,0 +1,35 @@ +## HC step 2: Local re-assembly and haplotype determination + +http://gatkforums.broadinstitute.org/gatk/discussion/4146/hc-step-2-local-re-assembly-and-haplotype-determination + +

This document details the procedure used by HaplotypeCaller to re-assemble read data and determine candidate haplotypes as a prelude to variant calling. For more context information on how this fits into the overall HaplotypeCaller method, please see the more general HaplotypeCaller documentation.

+

Note that we are still working on producing figures to complement the text. We will update this document as soon as the figures are ready. Note also that this is a provisional document and some final corrections may be made for accuracy and/or completeness. Feedback is most welcome!

+
+

Overview

+

The previous step produced a list of ActiveRegions that showed some evidence of possible variation (see step 1 documentation). Now, we need to process each Active Region in order to generate a list of possible haplotypes based on the sequence data we have for that region.

+

To do so, the program first builds an assembly graph for each active region (determined in the previous step) using the reference sequence as a template. Then, it takes each read in turn and attempts to match it to a segment of the graph. Whenever portions of a read do not match the local graph, the program adds new nodes to the graph to account for the mismatches. After this process has been repeated with many reads, it typically yields a complex graph with many possible paths. However, because the program keeps track of how many reads support each path segment, we can select only the most likely (well-supported) paths. These likely paths are then used to build the haplotype sequences which will be used to call variants and assign per-sample genotypes in the next steps.

+
+

1. Reference graph assembly

+

First, we construct the reference assembly graph, which starts out as a simple directed DeBruijn graph. This involves decomposing the reference sequence into a succession of kmers (pronounced "kay-mers"), which are small sequence subunits that are k bases long. Each kmer sequence overlaps the previous kmer by k-1 bases. The resulting graph can be represented as a series of nodes and connecting edges indicating the sequential relationship between the adjacent bases. At this point, all the connecting edges have a weight of 0.

+

In addition to the graph, we also build a hash table of unique kmers, which we use to keep track of the position of nodes in the graph. At the beginning, the hash table only contains unique kmers found in the reference sequence, but we will add to it in the next step.

+

A note about kmer size: by default, the program will attempt to build two separate graphs, using kmers of 10 and 25 bases in size, respectively, but other kmer sizes can be specified from the command line with the -kmerSize argument. The final set of haplotypes will be selected from the union of the graphs obtained using each k.

+
+

2. Threading reads through the graph

+

This is where our simple reference graph turns into a read-threading graph, so-called because we're going to take each read in turn and try to match it to a path in the graph.

+

We start with the first read and compare its first kmer to the hash table to find if it has a match. If there is a match, we look up its position in the reference graph and record that position. If there is no match, we consider that it is a new unique kmer, so we add that unique kmer to the hash table and add a new node to the graph. In both cases, we then move on and repeat the process with the next kmer in the read until we reach the end of the read.

+

When two consecutive kmers in a read belong to two nodes that were already connected by an edge in the graph, we increase the weight of that edge by 1. If the two nodes were not connected yet, we add a new edge to the graph with a starting weight of 1. As we repeat the process on each read in turn, edge weights will accumulate along the paths that are best supported by the read data, which will help us select the most likely paths later on.
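
+

The following toy Python sketch (illustration only, far simpler than the real assembler) shows the bookkeeping described above: kmers become nodes, novel kmers are added as they are encountered in reads, and edge weights count read support. The sequences and function name are invented for the example.

```python
from collections import defaultdict

def build_read_threading_graph(reference, reads, k=10):
    """Toy read-threading graph: nodes are kmers, edge weights count how many times
    two consecutive kmers were seen adjacent in a read."""
    edges = defaultdict(int)                       # (kmer, next_kmer) -> weight
    kmers = set()

    def kmerize(seq):
        return [seq[i:i + k] for i in range(len(seq) - k + 1)]

    ref_kmers = kmerize(reference)
    kmers.update(ref_kmers)
    for a, b in zip(ref_kmers, ref_kmers[1:]):
        edges[(a, b)] += 0                         # reference edges start with weight 0

    for read in reads:
        read_kmers = kmerize(read)
        kmers.update(read_kmers)                   # novel kmers become new nodes
        for a, b in zip(read_kmers, read_kmers[1:]):
            edges[(a, b)] += 1                     # supporting reads increase edge weight
    return kmers, edges

ref = "ACGTACGTGGAACCTTGACGT"
reads = ["ACGTACGTGGAACCTTGACGT", "ACGTACGTGGTACCTTGACGT"]   # second read carries a mismatch
nodes, edges = build_read_threading_graph(ref, reads, k=10)
print(len(nodes), sum(edges.values()))
```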

+

Note on graph complexity, cycles and non-unique kmers

+

For this process to work properly, we need the graph to be sufficiently complex (where the number of non-unique kmers is less than 4-fold the number of unique kmers found in the data) and without cycles. In certain genomic regions where there are a lot of repeated sequences, these conditions may not be met, because repeats cause cycles and diminish the number of available unique kmers. If none of the kmer sizes provided results in a viable graph (complex enough and without cycles) the program will automatically try the operation again with larger kmer sizes. Specifically, we take the largest k provided by the user (or by the default settings) and increase it by 10 bases. If no viable graph can be obtained after iterating over increased kmer sizes 6 times, we give up and skip the active region entirely.

+
+

3. Graph refinement

+

Once all the reads have been threaded through the graph, we need to clean it up a little. The main cleaning-up operation is called pruning (like the gardening technique). The goal of the pruning operation is to remove noise due to errors. The basic idea is that sections of the graph that are supported by very few reads are most probably the result of stochastic errors, so we are going to remove any sections that are supported by fewer than a certain threshold number of reads. By default the threshold value is 2, but this can be controlled from the command line using the -minPruning argument. In practice, this means that linear chains in the graph (linear sequence of vertices and edges without any branching) where all edges have fewer than 2 supporting reads will be removed. Increasing the threshold value will lead to faster processing and higher specificity, but will decrease sensitivity. Decreasing this value will do the opposite, decreasing specificity but increasing sensitivity.

+

At this stage, the program also performs graph refinement operations, such as recovering dangling heads and tails from the splice junctions to compensate for issues that are related to limitations in graph assembly.

+

Note that if you are calling multiple samples together, the program also looks at how many of the samples support each segment, and only prunes segments for which fewer than a certain number of samples have the minimum required number of supporting reads. By default this sample number is 1, so as long as one sample in the cohort passes the pruning threshold, the segment will NOT be pruned. This is designed to avoid losing singletons (variants that are unique to a single sample in a cohort). This parameter can also be controlled from the command line using the -minPruningSamples argument, but keep in mind that increasing the default value may lead to decreased sensitivity.
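
+

A minimal Python sketch of that pruning criterion is shown below (illustration only, not GATK code); it keeps an edge only if at least -minPruningSamples samples each supply at least -minPruning supporting reads, and all names and numbers are invented for the example.

```python
def prune_edges(edge_support, min_pruning=2, min_pruning_samples=1):
    """Keep an edge if at least `min_pruning_samples` samples each contribute at least
    `min_pruning` supporting reads; otherwise prune it as likely noise.

    edge_support: dict mapping edge -> {sample_name: supporting_read_count}
    """
    kept = {}
    for edge, per_sample in edge_support.items():
        samples_passing = sum(1 for count in per_sample.values() if count >= min_pruning)
        if samples_passing >= min_pruning_samples:
            kept[edge] = per_sample
    return kept

support = {
    ("kmerA", "kmerB"): {"sample1": 1, "sample2": 0},   # weak everywhere -> pruned
    ("kmerA", "kmerC"): {"sample1": 1, "sample2": 5},   # a singleton in sample2 -> kept
}
print(list(prune_edges(support)))
```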

+
+

4. Select best haplotypes

+

Now that the graph is all cleaned up, the program builds haplotype sequences by traversing all possible paths in the graph and calculates a likelihood score for each one. This score is calculated as the product of transition probabilities of the path edges, where the transition probability of an edge is computed as the number of reads supporting that edge divided by the sum of the support of all edges that share that same source vertex.

+

In order to limit the amount of computation needed for the next step, we limit the number of haplotypes that will be considered for each value of k (remember that the program builds graphs for multiple kmer sizes). This is easy to do since we conveniently have scores for each haplotype; all we need to do is select the N haplotypes with the best scores. By default that number is very generously set to 128 (so the program would proceed to the next step with up to 128 haplotypes per value of k) but this can be adjusted from the command line using the -maxNumHaplotypesInPopulation argument. You would mainly want to decrease this number in order to improve speed; increasing that number would rarely be reasonable, if ever.
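
+

The scoring and selection described above can be sketched in a few lines of Python (illustration only, not GATK code): the score of a path is the product of its edge transition probabilities, and only the highest-scoring paths are kept. The graph and numbers are invented for the example.

```python
def path_score(path, edge_weight):
    """Product of transition probabilities along a path: each edge's weight divided by
    the total weight of all edges leaving the same source vertex."""
    out_total = {}
    for (src, _), w in edge_weight.items():
        out_total[src] = out_total.get(src, 0) + w
    score = 1.0
    for src, dst in zip(path, path[1:]):
        score *= edge_weight[(src, dst)] / out_total[src]
    return score

def best_haplotypes(paths, edge_weight, max_haplotypes=128):
    """Return the top-scoring paths, at most max_haplotypes of them."""
    return sorted(paths, key=lambda p: path_score(p, edge_weight), reverse=True)[:max_haplotypes]

edges = {("s", "a"): 9, ("s", "b"): 1, ("a", "e"): 9, ("b", "e"): 1}
paths = [["s", "a", "e"], ["s", "b", "e"]]
print(best_haplotypes(paths, edges, max_haplotypes=1))   # the well-supported path wins
```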

+
+

5. Identify potential variation sites

+

Once we have a list of plausible haplotypes, we perform a Smith-Waterman alignment (SWA) of each haplotype to the original reference sequence across the active region in order to reconstruct a CIGAR string for the haplotype. Note that indels will be left-aligned; that is, their start position will be set as the leftmost position possible.

+

This finally yields the potential variation sites that will be put through the variant modeling step next, bringing us back to the "classic" variant calling methods (as used by GATK's UnifiedGenotyper and Samtools' mpileup). Note that this list of candidate sites is essentially a super-set of what will eventually be the final set of called variants. Every site that will be called variant is in the super-set, but not every site that is in the super-set will be called variant.

\ No newline at end of file diff --git a/doc_archive/methods/HC_step_3_:_Evaluating_the_evidence_for_haplotypes_and_variant_alleles.md b/doc_archive/methods/HC_step_3_:_Evaluating_the_evidence_for_haplotypes_and_variant_alleles.md new file mode 100644 index 000000000..a567950a1 --- /dev/null +++ b/doc_archive/methods/HC_step_3_:_Evaluating_the_evidence_for_haplotypes_and_variant_alleles.md @@ -0,0 +1,18 @@ +## HC step 3 : Evaluating the evidence for haplotypes and variant alleles + +http://gatkforums.broadinstitute.org/gatk/discussion/4441/hc-step-3-evaluating-the-evidence-for-haplotypes-and-variant-alleles + +

This document describes the procedure used by HaplotypeCaller to evaluate the evidence for variant alleles based on candidate haplotypes determined in the previous step for a given ActiveRegion. For more context information on how this fits into the overall HaplotypeCaller method, please see the more general HaplotypeCaller documentation.

+

Overview

+

The previous step produced a list of candidate haplotypes for each ActiveRegion, as well as a list of candidate variant sites borne by the non-reference haplotypes. Now, we need to evaluate how much evidence there is in the data to support each haplotype. This is done by aligning each sequence read to each haplotype using the PairHMM algorithm, which produces per-read likelihoods for each haplotype. From that, we'll be able to derive how much evidence there is in the data to support each variant allele at the candidate sites, and that produces the actual numbers that will finally be used to assign a genotype to the sample.

+
+

1. Evaluating the evidence for each candidate haplotype

+

We originally obtained our list of haplotypes for the ActiveRegion by constructing an assembly graph and selecting the most likely paths in the graph by counting the number of supporting reads for each path. That was a fairly naive evaluation of the evidence, done over all reads in aggregate, and was only meant to serve as a preliminary filter to whittle down the number of possible combinations that we're going to look at in this next step.

+

Now we want to do a much more thorough evaluation of how much evidence we have for each haplotype. So we're going to take each individual read and align it against each haplotype in turn (including the reference haplotype) using the PairHMM algorithm (see Durbin et al., 1998). If you're not familiar with PairHMM, it's a lot like the BLAST algorithm, in that it's a pairwise alignment method that uses a Hidden Markov Model (HMM) and produces a likelihood score. In this use of the PairHMM, the output score expresses the likelihood of observing the read given the haplotype by taking into account the information we have about the quality of the data (i.e. the base quality scores and indel quality scores). Note: If reads from a pair overlap at a site and they have the same base, the base quality is capped at Q20 for both reads (Q20 is half the expected PCR error rate). If they do not agree, we set both base qualities to Q0.

+

This produces a big table of likelihoods where the columns are haplotypes and the rows are individual sequence reads. (example figure TBD)

+

The table essentially represents how much supporting evidence there is for each haplotype (including the reference), itemized by read.

+
+

2. Evaluating the evidence for each candidate site and corresponding alleles

+

Having per-read likelihoods for entire haplotypes is great, but ultimately we want to know how much evidence there is for individual alleles at the candidate sites that we identified in the previous step. To find out, we take the per-read likelihoods of the haplotypes and marginalize them over alleles, which produces per-read likelihoods for each allele at a given site. In practice, this means that for each candidate site, we're going to decide how much support each read contributes for each allele, based on the per-read haplotype likelihoods that were produced by the PairHMM.

+

This may sound complicated, but the procedure is actually very simple -- there is no real calculation involved, just cherry-picking appropriate values from the table of per-read likelihoods of haplotypes into a new table that will contain per-read likelihoods of alleles. This is how it happens. For a given site, we list all the alleles observed in the data (including the reference allele). Then, for each read, we look at the haplotypes that support each allele; we select the haplotype that has the highest likelihood for that read, and we write that likelihood in the new table. And that's it! For a given allele, the total likelihood will be the product of all the per-read likelihoods. (example fig TBD)
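
+

Here is a minimal Python sketch of that cherry-picking step (illustration only, not GATK code); the mapping from alleles to the haplotypes that carry them, and all the numbers, are invented for the example.

```python
def per_read_allele_likelihoods(read_hap_likelihoods, allele_to_haplotypes):
    """For each read and each allele at a site, keep the best likelihood among the
    haplotypes that carry that allele (the marginalization described above)."""
    table = {}
    for read, hap_likelihoods in read_hap_likelihoods.items():
        table[read] = {
            allele: max(hap_likelihoods[h] for h in haps)
            for allele, haps in allele_to_haplotypes.items()
        }
    return table

# Made-up likelihoods: hap1/hap2 carry the reference base A at the site, hap3 carries T.
read_hap_likelihoods = {
    "read1": {"hap1": 0.01, "hap2": 0.02, "hap3": 0.90},
    "read2": {"hap1": 0.80, "hap2": 0.70, "hap3": 0.05},
}
allele_to_haplotypes = {"A": ["hap1", "hap2"], "T": ["hap3"]}
print(per_read_allele_likelihoods(read_hap_likelihoods, allele_to_haplotypes))
```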

+

At the end of this step, sites where there is sufficient evidence for at least one of the variant alleles considered will be called variant, and a genotype will be assigned to the sample in the next (final) step.

\ No newline at end of file diff --git a/doc_archive/methods/HC_step_4:_Assigning_per-sample_genotypes.md b/doc_archive/methods/HC_step_4:_Assigning_per-sample_genotypes.md new file mode 100644 index 000000000..84551805b --- /dev/null +++ b/doc_archive/methods/HC_step_4:_Assigning_per-sample_genotypes.md @@ -0,0 +1,51 @@ +## HC step 4: Assigning per-sample genotypes + +http://gatkforums.broadinstitute.org/gatk/discussion/4442/hc-step-4-assigning-per-sample-genotypes + +

This document describes the procedure used by HaplotypeCaller to assign genotypes to individual samples based on the allele likelihoods calculated in the previous step. For more context information on how this fits into the overall HaplotypeCaller method, please see the more general HaplotypeCaller documentation. See also the documentation on the QUAL score as well as PL and GQ.

+

Note that this describes the regular mode of HaplotypeCaller, which does not emit an estimate of reference confidence. For details on how the reference confidence model works and is applied in -ERC modes (GVCF and BP_RESOLUTION) please see the reference confidence model documentation.

+

Overview

+

The previous step produced a table of per-read allele likelihoods for each candidate variant site under consideration. Now, all that remains to do is to evaluate those likelihoods in aggregate to determine what is the most likely genotype of the sample at each site. This is done by applying Bayes' theorem to calculate the likelihoods of each possible genotype, and selecting the most likely. This produces a genotype call as well as the calculation of various metrics that will be annotated in the output VCF if a variant call is emitted.

+
+

1. Preliminary assumptions / limitations

+

Quality

+

Keep in mind that we are trying to infer the genotype of each sample given the observed sequence data, so the degree of confidence we can have in a genotype depends on both the quality and the quantity of the available data. By definition, low coverage and low quality will both lead to lower confidence calls. The GATK only uses reads that satisfy certain mapping quality thresholds, and only uses “good” bases that satisfy certain base quality thresholds (see documentation for default values).

+

Ploidy

+

Both the HaplotypeCaller and GenotypeGVCFs (but not UnifiedGenotyper) assume that the organism of study is diploid by default, but desired ploidy can be set using the -ploidy argument. The ploidy is taken into account in the mathematical development of the Bayesian calculation. The generalized form of the genotyping algorithm that can handle ploidies other than 2 is available as of version 3.3-0. Note that using ploidy for pooled experiments is subject to some practical limitations due to the number of possible combinations resulting from the interaction between ploidy and the number of alternate alleles that are considered (currently, the maximum "workable" ploidy is ~20 for a max number of alt alleles = 6). Future developments will aim to mitigate those limitations.

+

Paired end reads

+

Reads that are mates in the same pair are not handled together in the reassembly, but if they overlap, there is some special handling to ensure they are not counted as independent observations.

+

Single-sample vs multi-sample

+

We apply different genotyping models when genotyping a single sample as opposed to multiple samples together (as done by HaplotypeCaller on multiple inputs or GenotypeGVCFs on multiple GVCFs). The multi-sample case is not currently documented for the public but is an extension of previous work by Heng Li and others.

+
+

2. Calculating genotype likelihoods using Bayes' Theorem

+

We use the approach described in Li 2011 to calculate the posterior probabilities of non-reference alleles (Methods 2.3.5 and 2.3.6) extended to handle multi-allelic variation.

+

The basic formula we use for all types of variation under consideration (SNPs, insertions and deletions) is:

+

$$ P(G|D) = \frac{ P(G) P(D|G) }{ \sum_{i} P(G_i) P(D|G_i) } $$

+

If that is meaningless to you, please don't freak out -- we're going to break it down and go through all the components one by one. First of all, the term on the left:

+

$$ P(G|D) $$

+

is the quantity we are trying to calculate for each possible genotype: the conditional probability of the genotype G given the observed data D.

+

Now let's break down the term on the right:

+

$$ \frac{ P(G) P(D|G) }{ \sum_{i} P(G_i) P(D|G_i) } $$

+

We can ignore the denominator (bottom of the fraction) because it ends up being the same for all the genotypes, and the point of calculating this likelihood is to determine the most likely genotype. The important part is the numerator (top of the fraction):

+

$$ P(G) P(D|G) $$

+

which is composed of two things: the prior probability of the genotype and the conditional probability of the data given the genotype.

+

The first one is the easiest to understand. The prior probability of the genotype G:

+

$$ P(G) $$

+

represents how probable we expect this genotype to be based on previous observations, studies of the population, and so on. By default, the GATK tools use a flat prior (always the same value), but you can input your own set of priors if you have information about the frequency of certain genotypes in the population you're studying.

+

The second one is a little trickier to understand if you're not familiar with Bayesian statistics. It is called the conditional probability of the data given the genotype, but what does that mean? Assuming that the genotype G is the true genotype,

+

$$ P(D|G) $$

+

is the probability of observing the sequence data that we have in hand. That is, how likely would we be to pull out a read with a particular sequence from an individual that has this particular genotype? We don't have that number yet, so this requires a little more calculation, using the following formula:

+

$$ P(D|G) = \prod_{j} \left( \frac{P(D_j | H_1)}{2} + \frac{P(D_j | H_2)}{2} \right) $$

+

You'll notice that this is where the diploid assumption comes into play, since here we decomposed the genotype G into:

+

$$ G = H_1H_2 $$

+

which allows for exactly two possible haplotypes. In future versions we'll have a generalized form of this that will allow for any number of haplotypes.

+

Now, back to our calculation, what's left to figure out is this:

+

$$ P(D_j|H_n) $$

+

which as it turns out is the conditional probability of the data given a particular haplotype (or specifically, a particular allele), aggregated over all supporting reads. Conveniently, that is exactly what we calculated in Step 3 of the HaplotypeCaller process, when we used the PairHMM to produce the likelihoods of each read against each haplotype, and then marginalized them to find the likelihoods of each read for each allele under consideration. So all we have to do at this point is plug the values from that table into the equation above, and we can work our way back up to obtain:

+

$$ P(G|D) $$

+

for the genotype G.

+
+

3. Selecting a genotype and emitting the call record

+

We go through the process of calculating a likelihood for each possible genotype based on the alleles that were observed at the site, considering every possible combination of alleles. For example, if we see an A and a T at a site, the possible genotypes are AA, AT and TT, and we end up with 3 corresponding probabilities. We pick the largest one, which corresponds to the most likely genotype, and assign that to the sample.
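
+

The following Python sketch (illustration only, not GATK code) strings the pieces together for the diploid case: P(D|G) is computed from the per-read allele likelihoods using the formula above, every genotype built from the observed alleles is scored with a flat prior, and the best one is selected. All numbers are invented for the example.

```python
from itertools import combinations_with_replacement

def genotype_likelihood(per_read_allele_lik, allele1, allele2):
    """P(D|G) for a diploid genotype: product over reads of the average of the
    per-read likelihoods of the genotype's two alleles."""
    p = 1.0
    for read_lik in per_read_allele_lik:
        p *= 0.5 * read_lik[allele1] + 0.5 * read_lik[allele2]
    return p

def call_genotype(per_read_allele_lik, alleles):
    """Score every genotype with a flat prior and return the most likely one."""
    genotypes = list(combinations_with_replacement(alleles, 2))
    scores = {g: genotype_likelihood(per_read_allele_lik, *g) for g in genotypes}
    return max(scores, key=scores.get), scores

# Three reads at a site with alleles A and T (made-up per-read allele likelihoods)
reads = [{"A": 0.9, "T": 0.05}, {"A": 0.1, "T": 0.8}, {"A": 0.85, "T": 0.1}]
print(call_genotype(reads, ["A", "T"]))   # a heterozygous A/T call is favoured here
```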

+

Note that depending on the variant calling options specified in the command-line, we may only emit records for actual variant sites (where at least one sample has a genotype other than homozygous-reference) or we may also emit records for reference sites. The latter is discussed in the reference confidence model documentation.

+

Assuming that we have a non-ref genotype, all that remains is to calculate the various site-level and genotype-level metrics that will be emitted as annotations in the variant record, including QUAL as well as PL and GQ -- see the linked docs for details. For more information on how the other variant context metrics are calculated, please see the corresponding variant annotations documentation.

\ No newline at end of file diff --git a/doc_archive/methods/How_the_HaplotypeCaller's_reference_confidence_model_works.md b/doc_archive/methods/How_the_HaplotypeCaller's_reference_confidence_model_works.md new file mode 100644 index 000000000..0f405c2e9 --- /dev/null +++ b/doc_archive/methods/How_the_HaplotypeCaller's_reference_confidence_model_works.md @@ -0,0 +1,15 @@ +## How the HaplotypeCaller's reference confidence model works + +http://gatkforums.broadinstitute.org/gatk/discussion/4042/how-the-haplotypecallers-reference-confidence-model-works + +

This document describes the reference confidence model applied by HaplotypeCaller to generate genomic VCFs (gVCFs), invoked by -ERC GVCF or -ERC BP_RESOLUTION (see the FAQ on gVCFs for format details).

+

Please note that this document may be expanded with more detailed information in the near future.

+

How it works

+

The mode works by assembling the reads to create potential haplotypes, realigning the reads to their most likely haplotypes, and then projecting these reads back onto the reference sequence via their haplotypes to compute alignments of the reads to the reference. For each position in the genome we have either an ALT call (via the standard calling mechanism) or we can estimate the chance that some (unknown) non-reference allele is segregating at this position by examining the realigned reads that span the reference base. At this base we perform two calculations:

+ +

Based on this, we emit the genotype likelihoods (PL) and compute the GQ (from the PLs) for the least confidence of these two models.

+

We use a symbolic allele pair, <NON_REF>, to indicate that the site is not homozygous reference, and because we have an ALT allele we can provide allele-specific AD and PL field values.

+

For details of the gVCF format, please see the document that explains what is a gVCF.

\ No newline at end of file diff --git a/doc_archive/methods/Introduction_to_the_GATK_Best_Practices_workflows.md b/doc_archive/methods/Introduction_to_the_GATK_Best_Practices_workflows.md new file mode 100644 index 000000000..4fa5578c1 --- /dev/null +++ b/doc_archive/methods/Introduction_to_the_GATK_Best_Practices_workflows.md @@ -0,0 +1,20 @@ +## Introduction to the GATK Best Practices workflows + +http://gatkforums.broadinstitute.org/gatk/discussion/4066/introduction-to-the-gatk-best-practices-workflows + +This article is part of the Best Practices documentation. See http://www.broadinstitute.org/gatk/guide/best-practices for the full documentation set. +

The "GATK Best Practices" are workflow descriptions that provide step-by-step recommendations for getting the best analysis results possible out of high-throughput sequencing data. At present, we provide the following Best Practice workflows:

+ +

These recommendations have been developed by the GATK development team over years of analysis work on many of the Broad Institute's sequencing projects, and are applied in the Broad's production pipelines. As a general rule, the command-line arguments and parameters given in the documentation examples are meant to be broadly applicable.

+
+

Important notes on context and caveats

+

Our testing focuses largely on data from human whole-genome or whole-exome samples sequenced with Illumina technology, so if you are working with different types of data or experimental designs, you may need to adapt certain branches of the workflow, as well as certain parameter selections and values. Unfortunately we are not able to provide official recommendations on how to deal with very different experimental designs or divergent datatypes (such as Ion Torrent).

+

In addition, the illustrations and tutorials provided in these pages tend to assume a simple experimental design where each sample is used to produce one DNA library that is sequenced separately on one lane of the machine. See the Guide for help dealing with other experimental designs.

+

Finally, please be aware that several key steps in the Best Practices workflow make use of existing resources such as known variants, which are readily available for humans (we provide several useful resource datasets for download from our FTP server). If no such resources are available for your organism, you may need to bootstrap your own or use alternative methods. We have documented useful methods to do this wherever possible, but be aware than some issues are currently still without a good solution.

+
+Important note on GATK versions + +The Best Practices have been updated for GATK version 3. If you are running an older version, you should seriously consider upgrading. For more details about what has changed in each version, please see the Version History section. If you cannot upgrade your version of GATK for any reason, please look up the corresponding version of the GuideBook PDF (also in the Version History section) to ensure that you are using the appropriate recommendations for your version. \ No newline at end of file diff --git a/doc_archive/methods/Local_Realignment_around_Indels.md b/doc_archive/methods/Local_Realignment_around_Indels.md new file mode 100644 index 000000000..9a55f4c26 --- /dev/null +++ b/doc_archive/methods/Local_Realignment_around_Indels.md @@ -0,0 +1,40 @@ +## Local Realignment around Indels + +http://gatkforums.broadinstitute.org/gatk/discussion/38/local-realignment-around-indels + +

For a discussion of the implications of removing indel realignment from workflows, see Blog#7847 from June 2016.

+
+

Realigner Target Creator

+

For a complete, detailed argument reference, refer to the GATK document page here. +


+

+

Indel Realigner

+

For a complete, detailed argument reference, refer to the GATK document page here. +


+

+
+

Running the Indel Realigner only at known sites

+

While we advocate for using the Indel Realigner over an aggregated bam using the full Smith-Waterman alignment algorithm, it will work for just a single lane of sequencing data when run in -knownsOnly mode. Novel sites obviously won't be cleaned up, but the majority of a single individual's short indels will already have been seen in dbSNP and/or 1000 Genomes. One would employ the known-only/lane-level realignment strategy in a large-scale project (e.g. 1000 Genomes) where computation time is severely constrained and limited. We modify the example arguments from above to reflect the command-lines necessary for known-only/lane-level cleaning. +

The RealignerTargetCreator step would need to be done just once for a single set of indels; so as long as the set of known indels doesn't change, the output.intervals file from below would never need to be recalculated. +

+
+ java -Xmx1g -jar /path/to/GenomeAnalysisTK.jar \
+  -T RealignerTargetCreator \
+  -R /path/to/reference.fasta \
+  -o /path/to/output.intervals \
+  -known /path/to/indel_calls.vcf
+
+

The IndelRealigner step needs to be run on every bam file. +

+
+java -Xmx4g -Djava.io.tmpdir=/path/to/tmpdir \
+  -jar /path/to/GenomeAnalysisTK.jar \
+  -I <lane-level.bam> \
+  -R <ref.fasta> \
+  -T IndelRealigner \
+  -targetIntervals <intervalListFromStep1Above.intervals> \
+  -o <realignedBam.bam> \
+  -known /path/to/indel_calls.vcf \
+  --consensusDeterminationModel KNOWNS_ONLY \
+  -LOD 0.4
+
\ No newline at end of file diff --git a/doc_archive/methods/Math_notes:_How_PL_is_calculated_in_HaplotypeCaller.md b/doc_archive/methods/Math_notes:_How_PL_is_calculated_in_HaplotypeCaller.md new file mode 100644 index 000000000..9972b9acb --- /dev/null +++ b/doc_archive/methods/Math_notes:_How_PL_is_calculated_in_HaplotypeCaller.md @@ -0,0 +1,80 @@ +## Math notes: How PL is calculated in HaplotypeCaller + +http://gatkforums.broadinstitute.org/gatk/discussion/5913/math-notes-how-pl-is-calculated-in-haplotypecaller + +

PL is a sample-level annotation calculated by GATK variant callers such as HaplotypeCaller, recorded in the FORMAT/sample columns of variant records in VCF files. This annotation represents the normalized Phred-scaled likelihoods of the genotypes considered in the variant record for each sample, as described here.

+

This article clarifies how the PL values are calculated and how this relates to the value of the GQ field.

+
+

Contents

+
1. The basic math
2. Example and interpretation
3. Special case: non-reference confidence model (GVCF mode)
+
+

1. The basic math

+

The basic formula for calculating PL is:

+

$$ PL = -10 * \log{P(Genotype | Data)} $$

+

where P(Genotype | Data) is the conditional probability of the Genotype given the sequence Data that we have observed. The process by which we determine the value of P(Genotype | Data) is described in the genotyping section of the Haplotype Caller documentation.

+

Once we have that probability, we simply take the log of it and multiply it by -10 to put it into Phred scale. Then we normalize the values across all genotypes so that the PL value of the most likely genotype is 0, which we do simply by subtracting the value of the lowest PL from all the values.

+

The reason we like to work in Phred scale is because it makes it much easier to work with the very small numbers involved in these calculations. One thing to keep in mind of course is that Phred is a log scale, so whenever we need to do a division or multiplication operation (e.g. multiplying probabilities), in Phred scale this will be done as a subtraction or addition.

+
+

2. Example and interpretation

+

Here’s a worked-out example to illustrate this process. Suppose we have a site where the reference allele is A, we observed one read that has a non-reference allele T at the position of interest, and we have in hand the conditional probabilities calculated by HaplotypeCaller based on that one read (if we had more reads, their contributions would be multiplied -- or in log space, added).

+

Please note that the values chosen for this example have been simplified and may not be reflective of actual probabilities calculated by Haplotype Caller.

+
# Alleles
+Reference: A
+Read: T
+
+# Conditional probabilities calculated by HC 
+P(AA | Data) = 0.000001
+P(AT | Data) = 0.000100
+P(TT | Data) = 0.010000
+

Calculate the raw PL values

+

We want to determine the PLs of the genotype being 0/0, 0/1, and 1/1, respectively. So we apply the formula given earlier, which yields the following values:

| Genotype | A/A | A/T | T/T |
|----------|-----|-----|-----|
| Raw PL | -10 * log(0.000001) = 60 | -10 * log(0.000100) = 40 | -10 * log(0.010000) = 20 |
+

Our first observation here is that the genotype for which the conditional probability was the highest turns out to get the lowest PL value. This is expected because, as described in the VCF FAQ, the PL is the likelihood of the genotype, which means (rather unintuitively if you’re not a stats buff) it is the probability that the genotype is not correct. So, low values mean a genotype is more likely, and high values means it’s less likely.

+

Normalize

+

At this point we have one more small transformation to make before we emit the final PL values to the VCF: we are going to normalize the values so that the lowest PL value is zero, and the rest are scaled relative to that. Since we’re in log space, we do this simply by subtracting the lowest value, 20, from the others, yielding the following final PL values:

| Genotype | A/A | A/T | T/T |
|----------|-----|-----|-----|
| Normalized PL | 60 - 20 = 40 | 40 - 20 = 20 | 20 - 20 = 0 |
+

We see that there is a direct relationship between the scaling of the PLs and the original probabilities: we had chosen probabilities that were each 100 times more or less likely than the next, and in the final PLs we see that the values are spaced out by a factor of 20, which is the Phred-scale equivalent of 100. This gives us a very convenient way to estimate how the numbers relate to each other -- and how reliable the genotype assignment is -- with just a glance at the PL field in the VCF record.

+

Genotype quality

+

We actually formalize this assessment of genotype quality in the GQ annotation, as described also in the VCF FAQ. The value of GQ is simply the difference between the second lowest PL and the lowest PL (which is always 0). So, in our example GQ = 20 - 0 = 20. Note that the value of GQ is capped at 99 for practical reasons, so even if the calculated GQ is higher, the value emitted to the VCF will be 99.
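
+

The whole worked example above can be reproduced with a few lines of Python (illustration only, not GATK code); the function name is invented.

```python
import math

def pl_and_gq(genotype_probabilities):
    """Turn per-genotype probabilities into normalized Phred-scaled PLs and a GQ."""
    raw = [-10 * math.log10(p) for p in genotype_probabilities]
    lowest = min(raw)
    pls = [round(r - lowest) for r in raw]        # most likely genotype gets PL = 0
    gq = min(sorted(pls)[1], 99)                  # second-lowest PL, capped at 99
    return pls, gq

# The probabilities from the worked example above (AA, AT, TT)
print(pl_and_gq([0.000001, 0.000100, 0.010000]))  # -> ([40, 20, 0], 20)
```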

+
+

3. Special case: non-reference confidence model (GVCF mode)

+

When you run HaplotypeCaller with -ERC GVCF to produce a gVCF, there is an additional calculation to determine the genotype likelihoods associated with the symbolic <NON_REF> allele (which represents the possibilities that remain once you’ve eliminated the REF allele and any ALT alleles that are being evaluated explicitly).

+

The PL values for any possible genotype that includes the <NON_REF> allele have to be calculated a little differently than what is explained above, because HaplotypeCaller cannot directly determine the conditional probabilities of genotypes involving <NON_REF>. Instead, it uses base quality scores to model the genotype likelihoods.

\ No newline at end of file diff --git a/doc_archive/methods/Math_notes:_Understanding_the_QUAL_score_and_its_limitations.md b/doc_archive/methods/Math_notes:_Understanding_the_QUAL_score_and_its_limitations.md new file mode 100644 index 000000000..3f9e57aa4 --- /dev/null +++ b/doc_archive/methods/Math_notes:_Understanding_the_QUAL_score_and_its_limitations.md @@ -0,0 +1,68 @@ +## Math notes: Understanding the QUAL score and its limitations + +http://gatkforums.broadinstitute.org/gatk/discussion/7258/math-notes-understanding-the-qual-score-and-its-limitations + +

It used to be that the first rule of GATK was: don't talk about the QUAL score. No more! This document covers the key points in loving detail. Figures are hand-drawn and scanned for now; we'll try to redo them cleanly when we find a bit of time (don't hold your breath though).

+
+

What is the QUAL score?

+

It's the Phred-scaled posterior probability that all samples in your callset are homozygous reference.

+
+

Okay, but really, what does it tell us?

+

Basically, we're trying to give you the probability that all the variant evidence you saw in your data is wrong.

+

If you have just a handful of low quality reads, your QUAL will be pretty low -- possibly too low to emit. We typically use a threshold of 10 to emit and 30 to call in genotyping, either via HaplotypeCaller in "normal" (non-GVCF) mode or via GenotypeGVCFs (in the GVCF workflow, HaplotypeCaller sets both emit and call thresholds to 0 and emits everything to the GVCF).

+

However, if you have a lot of variant reads, your QUAL will be much higher. But it only describes the probability that all of your data is erroneous, so it has trouble distinguishing between a small number of reads with high quality mismatches and a large number of reads with low quality mismatches. That's why we recommend using QualByDepth (the QUAL normalized by the depth of reads supporting the variant) as an annotation for VQSR, because that will yield higher annotation values for high quality reads and lower values for big piles of weak evidence.

+
+

I know the PLs give the genotype likelihoods for each sample, but how do we combine them for all samples?

+

Heng Li's 2011 paper, section 2.3.5 (there are other copies elsewhere) gives the equations for the biallelic case. It's a recursive relation, so we have to use a dynamic programming algorithm (as you may have seen in the chapter on pairwise alignments in the Durbin et al. "Biological Sequence Analysis" book).

+

This lovely diagram lays it all out:

+ +

S_1...S_N are your N samples, which we're going to evaluate cumulatively as we move across the columns of the matrix. Here we're being very general and allowing each sample to have a different ploidy, which we'll represent with p_i. Thus the total number of chromosomes is Sum{p_i}=P.

+

We're interested in the S_N column because that represents the AC calculations once we take into account all N samples. The S_N column still isn't our final joint likelihood because we added the samples in a particular order, but more on that later.

+

We calculate the joint likelihood across samples for all ACs from zero to the total number of chromosomes. We look at all ACs because we also use this calculation to determine the MLEAC that gets reported as part of the "genotyping" process. In the matrix above, we're indexing by i for sample and j for allele count (AC). g_i represents the genotype of the ith sample in terms of its number of alt alleles, i.e. for homRef g_i=0. Note that this uses a different approach to break things down than Heng Li's paper, but it's more intuitive with this indexing. And remember this is the biallelic case, so we can assume any non-reference alleles are the same single alt allele. L(g_i) is the likelihood of the genotype, which we can get from sample i's PLs (after we un-Phred scale them, that is).

+

The "matrix" is triangular because as AC increases, we have to allocate a certain number of samples as being homozygous variant, so those have g_i = 2 with probability 1. Here we show the calculation to fill in the z_ij cell in the matrix, which is the cell corresponding to seeing j alt alleles after taking into account i samples. If sample i is diploid, there are three cells we need to take into account (because i can have 3 genotypes -- 0/0, 0/1, and 1/1 corresponding to g_i={0,1,2}), all of which come from the column where we looked at i-1 samples.

+

Thus z_ij is the sum of entries where i-1 samples had j alts (z_i-1,j, and sample i is homRef), where i-1 samples had j-1 alts (z_i-1,j-1, and sample i is het) and where i-1 samples had j-2 alts (z_i-1,j-2, and sample i is homVar), taking into account the binomial coefficient (binomial because we're biallelic here, so we're only interested in the ref set and the alt set) for the number of ways to arrange sample i's chromosomes.

+

By the time we get to column S_N, we've accumulated all the PL data for all the samples. We can then get the likelihood that AC=j in our callset by using the entry in the row according to AC=j and dividing it by the binomial coefficient for the total number of chromosomes (P) with j alt alleles to account for the fact that we could see those alt chromosomes in our samples in any order.
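Here is a minimal sketch in plain Python of that recursion for diploid samples (an illustration of the scheme described above, not the actual GATK code; it assumes each sample's genotype likelihoods have already been un-Phred-scaled into linear-space values ordered as [homRef, het, homVar]):

from math import comb   # Python 3.8+
def joint_ac_likelihoods(sample_gls):
    # sample_gls: per-sample [L(homRef), L(het), L(homVar)] in linear space
    P = 2 * len(sample_gls)        # total number of chromosomes
    z = [1.0] + [0.0] * P          # with 0 samples, only AC=0 is possible
    for gl in sample_gls:
        new_z = [0.0] * (P + 1)
        for j in range(P + 1):
            for g in (0, 1, 2):    # alt alleles carried by this sample
                if j - g >= 0:
                    # binomial coefficient: ways to arrange this sample's chromosomes
                    new_z[j] += z[j - g] * gl[g] * comb(2, g)
        z = new_z
    # divide by the ways to place j alt alleles among all P chromosomes
    return [z[j] / comb(P, j) for j in range(P + 1)]
# Two samples: one confidently het, one confidently homRef -> AC=1 dominates
print(joint_ac_likelihoods([[0.01, 0.98, 0.01], [0.97, 0.02, 0.01]]))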

+
+

Wait, that's just a likelihood. But you said that the QUAL is a posterior? So that means there's a prior?

+

Yep! In short, the prior based on AC is Pr(AC = i) = θ/i for i > 0, making Pr(AC = 0) = 1 − Σ_{P ≥ i > 0} Pr(AC = i), where P is the total number of chromosomes in the cohort.

+
+

What's the long version?

+

The prior, which is uniform across all sites, comes from population genetics theory, specifically coalescent theory. Let's start by defining some of our population genetics terminology. In the GATK docs, we use θ as the population heterozygosity under the neutral mutation model. Heterozygosity is the probability that two alleles drawn at random from the population will be different by state. In modern sequencing terms, that just means that there will be a variant in one with respect to the other. Note that two alleles can be identical by state but different by origin, i.e. the same variant occurred independently. If we assume that all loci are equally likely to be variant (which we know in modern times to be false, but this assumption underlies some important population genetics theories that we use as approximations), then we can also describe θ as the rate at which variants occur in the genome: one per 1/θ basepairs.

+

From Gillespie, "a coalescent is the lineage of alleles in a sample [as in a cohort of individuals sampled from the population] traced backwards in time to their common ancestor allele." Forward in time, the splits in the tree can be thought of as mutation events that generate new alleles. Backwards in time, they are referred to as coalescent events because two branches of the tree coalesce, or come together.

+ +

Each split in the coalescent represents the occurrence of a variant (let's say that each left branch is unchanged and the right branch picks up the new variant). Allele A never saw any variants, but one occurred separating A from B/C/D at -t3. Then another occurred separating B/C from D at -t2, and a final one separating B from C at -t1. So allele A is still "wild type" with no variants. Allele B has only variant t3. Allele C has two variants: t3 and t1. Allele D has two variants: t3 and t2. So variant t3 has AC=3 (three alleles stemming from its right, non-reference branch), t2 has AC=1 and t1 has AC=1. Time here is given in generations of the population, so multiple generations can occur without there being a mutational event leading to a new allele.

+

The total time in the coalescent is measured as the sum of the lengths of all the branches of the tree describing the coalescent. For the figure, Tc = 4t1 + 3(t2-t1) + 2(t3-t2). If we define Ti as the time required to reduce a coalescent with i alleles to one with i-1 alleles, we can write Tc as 4T4 + 3T3 + 2T2. In the forward direction, Ti then becomes the amount of time required to introduce a new mutation into a population of i-1 distinct alleles.

+

To derive an expected value for Ti, let's look at how each allele is derived from its ancestors in a sample of n alleles drawn from a population of N individuals, under the assumption that a coalescence has not occurred, i.e. that each allele has a different ancestor in the previous generation because there were no coalescence events (or mutations in the forward time direction). The first (reference) allele (A in the diagram) is guaranteed to have an ancestor in the previous generation because there were no mutations. The second allele has to have a different ancestor than the first allele, or else they would be derived from the same source and thus be the same allele, because there were no mutations in this generation. The second allele has a different ancestor with probability 1-1/(2N) = (2N-1)/(2N) (where we're assuming ploidy=2, as we usually do for population genetics of humans). Note that there are 2N possible ancestor alleles and 2N-1 that are different from the first allele's ancestor. The probability that the third allele has a distinct ancestor, given that the first two do not share an ancestor, is (2N-2)/(2N), making the total probability of three alleles with three different ancestors:

+

$$ \dfrac{2N-1}{2N} \times \dfrac{2N-2}{2N} $$

+

We can continue this pattern for all n alleles to arrive at the probability that all n alleles have different ancestors, i.e. that no coalescent event (or variant event) has occurred:

+

$$ \left ( 1-\dfrac{1}{2N} \right )\times \left ( 1-\dfrac{2}{2N} \right )\times \cdots \times \left ( 1- \dfrac{n-1}{2N} \right ) $$

+

If we multiply out the terms and assume that terms with N^2 in the denominator are small enough to be ignored, we arrive at:

+

$$ 1- \dfrac{1}{2N}-\dfrac{2}{2N}- \cdots - \dfrac{n-1}{2N} $$

+

The probability of a coalescence occurring is the complement of the probability that it does not occur, giving:

+

$$ \dfrac {1+2+\cdots+(n-1)}{2N} = \dfrac{n(n-1)}{4N} $$

+

This is the probability of a coalescence occurring in any particular generation. The time to the first coalescence therefore follows a geometric distribution whose probability of success is the quantity above, so its expectation is:

+

$$ E[T_n] = \dfrac{4N}{n(n-1)} $$

+

We can generalize this to any coalescent event i as:

+

$$ E[T_i] = \dfrac{4N}{i(i-1)} $$

+

The total time in the coalescent can then be written as:

+

$$ T_c = \sum_{i=2}^{n} i T_i $$

+

Which is a generalization of the example worked out above based on the figure. The expectation of the time spent in the coalescent is then:

+

$$ E[T_c] = E\left[ \sum_{i=2}^{n} i T_i \right] = \sum_{i=2}^{n} i E[T_i] = 4N \sum_{i=2}^{n}\dfrac{1}{i-1} $$

+

The expected number of variants (Sn, called "segregating sites" in the old-school pop gen vernacular) is the neutral mutation rate (u) times the expected amount of time in the coalescent. A mutation can occur on any branch of the coalescent, so we want to sum the time in all branches to allow for all possibilities -- we did this above.

+

So the expected number of variants can be expressed in terms of the heterozygosity, which, if we describe it as a rate per basepair as above, allows us to describe the probability of a variant occurring at a given locus, forming the prior for our QUAL score. If we assume a cohort of unrelated individuals, the occurrence of any variant with AC > 1 must be the result of that variant occurring multiple times independently at the same locus. If we now assume the coalescent is restricted to the lineage of variants at a single position, we can reframe E[Sn] in terms of AC instead of the number of alleles. Then we can convert the index of the sum to be AC (the number of mutations, but restricted to the same locus) using i = i' + 1 (because the set n originally includes the reference allele), so that the new sum runs over allele counts from 1 to N, where N is the number of chromosomes in the cohort.

+

From there, we can show that Pr[AC = i] = θ/i, since each term in the expanded sum below corresponds to a single allele count:

+

$$ E[S_n] = uE[T_c] = \theta\sum_{i=2}^{n}\dfrac{1}{i-1} = \dfrac{\theta}{1}+\dfrac{\theta}{2}+\cdots+\dfrac{\theta}{n-1} $$
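As a concrete check of that relationship, here is a small sketch in plain Python (the values of θ and n are purely illustrative):

theta = 0.001   # illustrative per-basepair heterozygosity
n = 10          # number of chromosomes in the sample
# E[S_n] = theta * (1/1 + 1/2 + ... + 1/(n-1))
expected_segregating_sites = theta * sum(1.0 / i for i in range(1, n))
# Prior on allele count at a single locus: Pr[AC = i] = theta / i
priors = {i: theta / i for i in range(1, n)}
prior_ac0 = 1 - sum(priors.values())
print(expected_segregating_sites, priors[1], priors[2], prior_ac0)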

+

(The theory presented here comes from Chapter 2 of "Population Genetics: A Concise Guide" by John H. Gillespie)

+
+

And the final QUAL calculation?

+

The posterior is simply:

+

P(AC = i|D) = Lk(D | AC = i) Pr(AC = i) / P(D)

+

QUAL is then the Phred-scaled P(AC = 0 | D).
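To make that concrete, here is a minimal sketch in plain Python (not the GATK implementation; the likelihoods and θ are illustrative) combining per-AC likelihoods with the θ/i prior to produce a QUAL value:

import math
def qual_from_likelihoods(ac_likelihoods, theta=0.001):
    # ac_likelihoods[j] = Lk(D | AC = j) in linear space, for j = 0..P
    P = len(ac_likelihoods) - 1
    priors = [0.0] * (P + 1)
    for i in range(1, P + 1):
        priors[i] = theta / i
    priors[0] = 1.0 - sum(priors[1:])
    unnorm = [lk * pr for lk, pr in zip(ac_likelihoods, priors)]
    p_data = sum(unnorm)               # P(D), the normalizing constant
    p_ac0 = unnorm[0] / p_data         # posterior P(AC = 0 | D)
    return -10 * math.log10(p_ac0)     # Phred-scaled
# Likelihoods strongly favoring AC=1 over AC=0 give a high QUAL
print(qual_from_likelihoods([1e-6, 0.9, 0.05]))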

+
+

Okay, but biallelic sites are boring. I like working with big callsets and multiallelic sites. How does the math change in that case?

+

Well, the short answer is that it gets a lot more complicated. Where we had a 2-D matrix for the biallelic case, we'll have an N-dimensional volume for a site with N alleles (including the reference).

+

Another lovely illustration helps us wrap our puny human brains around this idea:

+ +

Where p is ploidy, s is number of samples, a is number of alleles -- that's it.

+

So we use some approximations in order to get you your results in a reasonable amount of time. Those have been working out pretty well so far, but there are a few cases where they don't do as well, so we're looking into improving our approximations so nobody loses any rare alleles. Stay tuned!

\ No newline at end of file diff --git a/doc_archive/methods/Performing_sequence_coverage_analysis.md b/doc_archive/methods/Performing_sequence_coverage_analysis.md new file mode 100644 index 000000000..0625bbe92 --- /dev/null +++ b/doc_archive/methods/Performing_sequence_coverage_analysis.md @@ -0,0 +1,76 @@ +## Performing sequence coverage analysis + +http://gatkforums.broadinstitute.org/gatk/discussion/40/performing-sequence-coverage-analysis + +

Overview

+

This document describes the tools and concepts involved in performing sequence coverage analysis, where the purpose is to answer the common question: "(Where) Do I have enough sequence data to be empowered to discover variants with reasonable confidence?".

+

The tools involved are the following:

+ +

For an overview of the major annotations that are used by variant callers to express read depth at a variant site, and guidelines for using those metrics to evaluate variants, please see this document.

+
+

Introduction to coverage analysis as a QC method

+

Coverage analysis generally aims to answer the common question: "(Where) Do I have enough sequence data to be empowered to discover variants with reasonable confidence?".

+

This section is incomplete.

+
+

Using DepthOfCoverage to QC whole-genome data

+

DepthOfCoverage is a coverage profiler for a (possibly multi-sample) bam file. It uses a granular histogram that can be user-specified to present useful aggregate coverage data. It reports the following metrics over the entire .bam file:

+ +

That last metric is key to answering the question posed above, so we recommend running this tool on all samples together.

+

Note that DepthOfCoverage can be configured to output these statistics aggregated over genes by providing it with a RefSeq gene list.

+

DepthOfCoverage also outputs, by default, the total coverage at every locus, and the coverage per sample and/or read group. This behavior can optionally be turned off, or switched to base count mode, where base counts will be output at each locus, rather than total depth.

+

To get a summary of coverage by each gene, you may supply a refseq (or alternative) gene list via the argument

+
-geneList /path/to/gene/list.txt
+

The provided gene list must be of the following format:

+
585     NM_001005484    chr1    +       58953   59871   58953   59871   1       58953,  59871,  0       OR4F5   cmpl    cmpl    0,
+587     NM_001005224    chr1    +       357521  358460  357521  358460  1       357521, 358460, 0       OR4F3   cmpl    cmpl    0,
+587     NM_001005277    chr1    +       357521  358460  357521  358460  1       357521, 358460, 0       OR4F16  cmpl    cmpl    0,
+587     NM_001005221    chr1    +       357521  358460  357521  358460  1       357521, 358460, 0       OR4F29  cmpl    cmpl    0,
+589     NM_001005224    chr1    -       610958  611897  610958  611897  1       610958, 611897, 0       OR4F3   cmpl    cmpl    0,
+589     NM_001005277    chr1    -       610958  611897  610958  611897  1       610958, 611897, 0       OR4F16  cmpl    cmpl    0,
+589     NM_001005221    chr1    -       610958  611897  610958  611897  1       610958, 611897, 0       OR4F29  cmpl    cmpl    0,
+

For users who have access to internal Broad resources, the properly-formatted file containing refseq genes and transcripts is located at

+
/humgen/gsa-hpprojects/GATK/data/refGene.sorted.txt
+

If you do not have access (if you don't know, you probably don't have it), you can generate your own as described here.

+

If you supply the -geneList argument, DepthOfCoverage will output an additional summary file that looks as follows:

+
Gene_Name     Total_Cvg       Avg_Cvg       Sample_1_Total_Cvg    Sample_1_Avg_Cvg    Sample_1_Cvg_Q3       Sample_1_Cvg_Median      Sample_1_Cvg_Q1
+SORT1    594710  238.27  594710  238.27  165     245     330
+NOTCH2  3011542 357.84  3011542 357.84  222     399     >500
+LMNA    563183  186.73  563183  186.73  116     187     262
+NOS1AP  513031  203.50  513031  203.50  91      191     290
+

Note that the gene coverage will be aggregated only over samples (not read groups, libraries, or other types). The -geneList argument also requires specific intervals within genes to be given (say, the particular exons you are interested in, or the entire gene), and it functions by aggregating coverage from the interval level to the gene level, by referencing each interval to the gene in which it falls. Because by-gene aggregation looks for intervals that overlap genes, -geneList is ignored if -omitIntervals is specified.
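As a rough illustration of that aggregation logic, here is a hypothetical sketch in plain Python with made-up interval totals (DepthOfCoverage performs this bookkeeping internally):

# Hypothetical per-interval coverage totals and the gene each interval falls in
interval_coverage = {("chr1", 58953, 59871): 12000, ("chr1", 357521, 358460): 8000}
interval_gene = {("chr1", 58953, 59871): "GENE_A", ("chr1", 357521, 358460): "GENE_B"}
gene_totals = {}
for interval, total in interval_coverage.items():
    gene = interval_gene[interval]
    gene_totals[gene] = gene_totals.get(gene, 0) + total
print(gene_totals)   # coverage aggregated from intervals up to genes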

+
+

Using DiagnoseTargets to QC whole-exome data

+

DiagnoseTargets produces a pseudo-VCF file that provides a "CallableStatus" judgment for each position or range of positions in the input bam file. The possible judgments are as follows:

+ \ No newline at end of file diff --git a/doc_archive/methods/Purpose_and_operation_of_Read-backed_Phasing.md b/doc_archive/methods/Purpose_and_operation_of_Read-backed_Phasing.md new file mode 100644 index 000000000..f723118b2 --- /dev/null +++ b/doc_archive/methods/Purpose_and_operation_of_Read-backed_Phasing.md @@ -0,0 +1,55 @@ +## Purpose and operation of Read-backed Phasing + +http://gatkforums.broadinstitute.org/gatk/discussion/45/purpose-and-operation-of-read-backed-phasing + +

This document describes the underlying concepts of physical phasing as applied in the ReadBackedPhasing tool. For a complete, detailed argument reference, refer to the tool documentation page.

+

Note that as of GATK 3.3, physical phasing is performed automatically by HaplotypeCaller when it is run in -ERC GVCF or -ERC BP_RESOLUTION mode, so post-processing variant calls with ReadBackedPhasing is no longer necessary unless you want to merge consecutive variants into MNPs.

+
+

Underlying concepts

+

The biological unit of inheritance from each parent in a diploid organism is a set of single chromosomes, so that a diploid organism contains a set of pairs of corresponding chromosomes. The full sequence of each inherited chromosome is also known as a haplotype. It is critical to ascertain which variants are associated with one another in a particular individual. For example, if an individual's DNA possesses two consecutive heterozygous sites in a protein-coding sequence, there are two alternative scenarios of how these variants interact and affect the phenotype of the individual. In one scenario, they are on two different chromosomes, so each one has its own separate effect. On the other hand, if they co-occur on the same chromosome, they are thus expressed in the same protein molecule; moreover, if they are within the same codon, they are highly likely to encode an amino acid that is non-synonymous (relative to the other chromosome). The ReadBackedPhasing program serves to discover these haplotypes based on high-throughput sequencing reads.

+
+

How it works

+

The first step in phasing is to call variants ("genotype calling") using a SAM/BAM file of reads aligned to the reference genome -- this results in a VCF file. Using the VCF file and the SAM/BAM reads file, the ReadBackedPhasing tool considers all reads within a Bayesian framework and attempts to find the local haplotype with the highest probability, based on the reads observed.

+

The local haplotype and its phasing is encoded in the VCF file as a "|" symbol (which indicates that the alleles of the genotype correspond to the same order as the alleles for the genotype at the preceding variant site). For example, the following VCF indicates that SAMP1 is heterozygous at chromosome 20 positions 332341 and 332503, and the reference base at the first position (A) is on the same chromosome of SAMP1 as the alternate base at the latter position on that chromosome (G), and vice versa (G with C):

+
#CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  SAMP1   
+chr20   332341  rs6076509   A   G   470.60  PASS    AB=0.46;AC=1;AF=0.50;AN=2;DB;DP=52;Dels=0.00;HRun=1;HaplotypeScore=0.98;MQ=59.11;MQ0=0;OQ=627.69;QD=12.07;SB=-145.57    GT:DP:GL:GQ 0/1:46:-79.92,-13.87,-84.22:99
+chr20   332503  rs6133033   C   G   726.23  PASS    AB=0.57;AC=1;AF=0.50;AN=2;DB;DP=61;Dels=0.00;HRun=1;HaplotypeScore=0.95;MQ=60.00;MQ0=0;OQ=894.70;QD=14.67;SB=-472.75    GT:DP:GL:GQ:PQ  1|0:60:-110.83,-18.08,-149.73:99:126.93
+

The per-sample per-genotype PQ field is used to provide a Phred-scaled phasing quality score based on the statistical Bayesian framework employed for phasing. For cases of homozygous sites that lie in between phased heterozygous sites, these homozygous sites will be phased with the same quality as the next heterozygous site.

+

Note that this tool can only handle diploid data properly. If your organism of interest is polyploid or if you are working with data from pooling experiments, you should not run this tool on your data.

+
+

More detailed aspects of semantics of phasing in the VCF format

+ +

For example, consider the following records from the VCF file:

+
#CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  SAMP1   SAMP2
+chr1    1   .   A   G   99  PASS    .   GT:GL:GQ    0/1:-100,0,-100:99  0/1:-100,0,-100:99
+chr1    2   .   A   G   99  PASS    .   GT:GL:GQ:PQ 1|1:-100,0,-100:99:60   0|1:-100,0,-100:99:50
+chr1    3   .   A   G   99  PASS    .   GT:GL:GQ:PQ 0|1:-100,0,-100:99:60   0|0:-100,0,-100:99:60
+chr1    4   .   A   G   99  FAIL    .   GT:GL:GQ    0/1:-100,0,-100:99  0/1:-100,0,-100:99
+chr1    5   .   A   G   99  PASS    .   GT:GL:GQ:PQ 0|1:-100,0,-100:99:70   1|0:-100,0,-100:99:60
+chr1    6   .   A   G   99  PASS    .   GT:GL:GQ:PQ 0/1:-100,0,-100:99  1|1:-100,0,-100:99:70
+chr1    7   .   A   G   99  PASS    .   GT:GL:GQ:PQ 0|1:-100,0,-100:99:80   0|1:-100,0,-100:99:70
+chr1    8   .   A   G   99  PASS    .   GT:GL:GQ:PQ 0|1:-100,0,-100:99:90   0|1:-100,0,-100:99:80
+

The proper interpretation of these records is that SAMP1 has the following haplotypes at positions 1-5 of chromosome 1:

+
AGAAA
+GGGAG
+

And two haplotypes at positions 6-8:

+
AAA
+GGG
+

And, SAMP2 has the two haplotypes at positions 1-8:

+
AAAAGGAA
+GGAAAGGG
+

Note that we have excluded the non-PASS SNP call (at chr1:4), thus assuming that both samples are homozygous reference at that site.
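As an illustration, here is a minimal sketch in plain Python (with the SAMP1 records above hard-coded, not a general VCF parser) that reconstructs the two haplotype strings for positions 1-5, treating the non-PASS site as homozygous reference and taking the allele order in each GT as already reflecting the phase of the block:

# (POS, REF, ALT, FILTER, SAMP1 GT) from the example records above
records = [
    (1, "A", "G", "PASS", "0/1"),
    (2, "A", "G", "PASS", "1|1"),
    (3, "A", "G", "PASS", "0|1"),
    (4, "A", "G", "FAIL", "0/1"),
    (5, "A", "G", "PASS", "0|1"),
]
hap1, hap2 = [], []
for pos, ref, alt, filt, gt in records:
    if filt != "PASS":
        hap1.append(ref)   # non-PASS sites treated as homozygous reference
        hap2.append(ref)
        continue
    a1, a2 = gt.replace("|", "/").split("/")
    alleles = [ref, alt]
    hap1.append(alleles[int(a1)])
    hap2.append(alleles[int(a2)])
print("".join(hap1))   # AGAAA
print("".join(hap2))   # GGGAG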

\ No newline at end of file diff --git a/doc_archive/methods/Reference_implementation:_PairedEndSingleSampleWf_pipeline.md b/doc_archive/methods/Reference_implementation:_PairedEndSingleSampleWf_pipeline.md new file mode 100644 index 000000000..0be1c374a --- /dev/null +++ b/doc_archive/methods/Reference_implementation:_PairedEndSingleSampleWf_pipeline.md @@ -0,0 +1,729 @@ +## Reference implementation: PairedEndSingleSampleWf pipeline + +http://gatkforums.broadinstitute.org/gatk/discussion/7899/reference-implementation-pairedendsinglesamplewf-pipeline + +

+
+

This document describes the workflow and details pertinent parameters of the PairedEndSingleSampleWf pipeline, which implements GATK Best Practices (ca. June 2016) for pre-processing human germline whole-genome sequencing (WGS) data. This pipeline uses GRCh38 as the reference genome and, as the name implies, is specific to processing paired end reads for a single sample. It begins with unaligned paired reads in BAM format and results in a sample-level SNP and INDEL variant callset in GVCF format.

+ +

The diagram above shows the relationship between the WORKFLOW steps that call on specific TASKS. Certain steps use genomic intervals to parallelize processes, and these are boxed in the workflow diagram. An overview of the data transformations is given in the WORKFLOW definitions section and granular details are given in the TASK definitions section in the order shown below.

+
+

Jump to a section

+

WORKFLOW definition overview

+
  1. Map with BWA-MEM and merge to create clean BAM
  2. Flag duplicates with MarkDuplicates
  3. Base quality score recalibration
  4. Call SNP and INDEL variants with HaplotypeCaller
+

TASK definitions overview

+ +
+

What is NOT covered

+ +

Related resources

+ +
+

Requirements

+

Software

+ +
DOCKER_VERSION="1.8.1"
+PICARD_VERSION="1.1099"
+GATK35_VERSION="3.5-0-g36282e4"
+GATK4_VERSION="4.alpha-249-g7df4044"
+SAMTOOLS_VER="1.3.1"
+BWA_VER="0.7.13-r1126"
+

Scripts and data

+ +
+

+

WORKFLOW definition overview

+

Below we see that the workflow name is PairedEndSingleSampleWorkflow.

+

[0.0]

+ +

After the workflow name, the WORKFLOW definition lists the variables that can stand in for files, parameters or even parts of commands within tasks, e.g. the command for BWA alignment (L549). The actual files are given in an accompanying JSON file.

+

[0.1]

+ +

The WORKFLOW definition then outlines the tasks that it will perform. Because tasks may be listed in any order, it is the WORKFLOW definition that defines the order in which steps are run.

+

Let's break down the workflow into steps and examine their component commands.

+

back to top

+
+

+

1. Map with BWA-MEM and merge to create clean BAM

+

This step takes the unaligned BAM, aligns with BWA-MEM, merges information between the unaligned and aligned BAM and fixes tags and sorts the BAM.

+ +

▶︎ Observe the nesting of commands via their relative indentation. Our script writers use these indentations not because they make a difference for Cromwell interpretation but because they allow us human readers to visually comprehend where the scattering applies. In box [1.1] below, we see the scattering defined in L558 applies to processes in boxes [1.2], [1.3] and [1.4] in that the script nests, or indents further in, the commands for these processes within the scattering command.

+ +

[1.0]

+ +

[1.1]

+ +

[1.2]

+ +

[1.3]

+ +

[1.4]

+ +

back to top

+
+

+

2. Flag duplicates with MarkDuplicates

+

This step aggregates sample BAMs, flags duplicate sets, fixes tags and coordinate sorts. It starts with the output of [1.3]

+ +

[2.0]

+ +

[2.1]

+ +
+

+

3. Base quality score recalibration

+

This step creates intervals for scattering, performs BQSR, merges back the scattered results into a single file and finally compresses the BAM to CRAM format.

+ +

[3.0]

+ +

[3.1]

+ +

[3.2]

+ +

[3.3]

+ +

[3.4]

+ +

[3.5]

+ +

[3.6]

+ +

back to top

+
+

+

4. Call SNP and INDEL variants with HaplotypeCaller

+

This final step uses HaplotypeCaller to call variants over intervals then merges data into a GVCF for the sample, the final output of the workflow.

+ +

▶︎ For this pipeline workflow's setup, fifty parallel processes make sense for a genome of 3 billion basepairs. However, given the same setup, a 50-way split is overkill for a genome of 370 million basepairs, as in the case of the pufferfish.

+ +

[4.0]

+ +

[4.1]

+ +

[4.2]

+ +

[4.3]

+ +

back to top

+
+

+

TASK DEFINITIONS

+

GetBwaVersion

+

This task obtains the version of BWA to later notate within the BAM program group (@PG) line.

+

+
# Get version of BWA
+task GetBwaVersion {
+  command {
+    /usr/gitc/bwa 2>&1 | \
+    grep -e '^Version' | \
+    sed 's/Version: //'
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "1 GB"
+  }
+  output {
+    String version = read_string(stdout())
+  }
+}
+

+

SamToFastqAndBwaMem

+

The input to this task is an unaligned queryname-sorted BAM and the output is an aligned query-grouped BAM. This step pipes three processes: (i) conversion of BAM to FASTQ reads, (ii) alternate-contig-aware alignment with BWA-MEM and (iii) conversion of SAM to BAM reads. BWA-MEM requires FASTQ reads as input and produces SAM format reads. This task maps the reads using the BWA command defined as a string variable; in this workflow that string is defined in [0.1].

+ +

The alt-aware alignment depends on use of GRCh38 as the reference, the versions 0.7.13+ of BWA and the presence of BWA's ALT index from bwa-kit. If the ref_alt ALT index has no content or is not present, then the script exits with an exit 1 error. What this means is that this task is only compatible with a reference with ALT contigs and it only runs in an alt-aware manner.

+
# Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment
+task SamToFastqAndBwaMem {
+  File input_bam
+  String bwa_commandline
+  String output_bam_basename
+  File ref_fasta
+  File ref_fasta_index
+  File ref_dict
+
+  # This is the .alt file from bwa-kit (https://github.com/lh3/bwa/tree/master/bwakit),
+  # listing the reference contigs that are "alternative".
+  File ref_alt
+
+  File ref_amb
+  File ref_ann
+  File ref_bwt
+  File ref_pac
+  File ref_sa
+  Int disk_size
+  Int preemptible_tries
+
+  command <<<
+    set -o pipefail
+    # set the bash variable needed for the command-line
+    bash_ref_fasta=${ref_fasta}
+    # if ref_alt has data in it,
+    if [ -s ${ref_alt} ]; then
+      java -Xmx3000m -jar /usr/gitc/picard.jar \
+        SamToFastq \
+        INPUT=${input_bam} \
+        FASTQ=/dev/stdout \
+        INTERLEAVE=true \
+        NON_PF=true | \
+      /usr/gitc/${bwa_commandline} /dev/stdin -  2> >(tee ${output_bam_basename}.bwa.stderr.log >&2) | \
+      samtools view -1 - > ${output_bam_basename}.bam && \
+      grep -m1 "read .* ALT contigs" ${output_bam_basename}.bwa.stderr.log | \
+      grep -v "read 0 ALT contigs"
+
+    # else ref_alt is empty or could not be found
+    else
+      exit 1;
+    fi
+  >>>
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "14 GB"
+    cpu: "16"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+  output {
+    File output_bam = "${output_bam_basename}.bam"
+    File bwa_stderr_log = "${output_bam_basename}.bwa.stderr.log"
+  }
+}
+

+

MergeBamAlignment

+

This step takes an unmapped BAM and the aligned BAM and merges information from each. Reads, sequence and quality information and meta information from the unmapped BAM merge with the alignment information in the aligned BAM. The BWA version obtained by the GetBwaVersion task is used here in the program group (@PG) bwamem. What is imperative for this step, though only implied by the script, is that the sort order of the unmapped and aligned BAMs be identical, i.e. query-group sorted. The BWA-MEM alignment step outputs reads in exactly the same order as they are input, and so groups mates, secondary and supplementary alignments together for a given read name. The merging step requires that both files maintain this ordering and will produce a final merged BAM in the same query-grouped order, given the SORT_ORDER="unsorted" parameter. This has implications for how the MarkDuplicates task will flag duplicate sets.

+

Because the ATTRIBUTES_TO_RETAIN option is set to X0, any aligner-specific tags that are literally X0 will carryover to the merged BAM. BWA-MEM does not output such a tag but does output XS and XA tags for suboptimal alignment score and alternative hits, respectively. However, these do not carryover into the merged BAM. Merging retains certain tags from either input BAM (RG, SA, MD, NM, AS and OQ if present), replaces the PG tag as the command below defines and adds new tags (MC, MQ and FT).

+

▶︎ Note the NM tag values will be incorrect at this point and the UQ tag is absent. Update and addition of these are dependent on coordinate sort order. Specifically, the script uses a separate SortAndFixTags task to fix NM tags and add UQ tags.

+

The UNMAP_CONTAMINANT_READS=true option applies to likely cross-species contamination, e.g. bacterial contamination. MergeBamAlignment identifies reads that are (i) softclipped on both ends and (ii) map with less than 32 basepairs as contaminant. For a similar feature in GATK, see OverclippedReadFilter. If MergeBamAlignment determines a read is contaminant, then the mate is also considered contaminant. MergeBamAlignment unmaps the pair of reads by (i) setting the 0x4 flag bit, (ii) replacing column 3's contig name with an asterisk *, (iii) replacing columns 4 and 5 (POS and MAPQ) with zeros, and (iv) adding the FT tag to indicate the reason for unmapping the read, e.g. FT:Z:Cross-species contamination. The records retain their CIGAR strings. Note other processes also use the FT tag, e.g. to indicate reasons for setting the QCFAIL 0x200 flag bit, and will use different tag descriptions.

+
# Merge original input uBAM file with BWA-aligned BAM file
+task MergeBamAlignment {
+  File unmapped_bam
+  String bwa_commandline
+  String bwa_version
+  File aligned_bam
+  String output_bam_basename
+  File ref_fasta
+  File ref_fasta_index
+  File ref_dict
+  Int disk_size
+  Int preemptible_tries
+
+  command {
+    # set the bash variable needed for the command-line
+    bash_ref_fasta=${ref_fasta}
+    java -Xmx3000m -jar /usr/gitc/picard.jar \
+      MergeBamAlignment \
+      VALIDATION_STRINGENCY=SILENT \
+      EXPECTED_ORIENTATIONS=FR \
+      ATTRIBUTES_TO_RETAIN=X0 \
+      ALIGNED_BAM=${aligned_bam} \
+      UNMAPPED_BAM=${unmapped_bam} \
+      OUTPUT=${output_bam_basename}.bam \
+      REFERENCE_SEQUENCE=${ref_fasta} \
+      PAIRED_RUN=true \
+      SORT_ORDER="unsorted" \
+      IS_BISULFITE_SEQUENCE=false \
+      ALIGNED_READS_ONLY=false \
+      CLIP_ADAPTERS=false \
+      MAX_RECORDS_IN_RAM=2000000 \
+      ADD_MATE_CIGAR=true \
+      MAX_INSERTIONS_OR_DELETIONS=-1 \
+      PRIMARY_ALIGNMENT_STRATEGY=MostDistant \
+      PROGRAM_RECORD_ID="bwamem" \
+      PROGRAM_GROUP_VERSION="${bwa_version}" \
+      PROGRAM_GROUP_COMMAND_LINE="${bwa_commandline}" \
+      PROGRAM_GROUP_NAME="bwamem" \
+      UNMAP_CONTAMINANT_READS=true
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "3500 MB"
+    cpu: "1"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+  output {
+    File output_bam = "${output_bam_basename}.bam"
+  }
+}
+

+

MarkDuplicates

+

This task flags duplicate reads. Because the input is query-group-sorted, MarkDuplicates flags duplicate primary alignments, as well as the duplicate set's secondary and supplementary alignments, with the 0x400 bitwise SAM flag. Also, for singly mapping mates, duplicate flagging extends to cover unmapped mates. These extensions are features that are only available to query-group-sorted BAMs.

+

This command uses the ASSUME_SORT_ORDER="queryname" parameter to tell the tool the sort order to expect. Within the context of this workflow, at the point this task is called, we will have avoided any active sorting that would label the BAM header. We know that our original flowcell BAM is queryname-sorted and that BWA-MEM maintains this order to give us query-grouped alignments.

+

The OPTICAL_DUPLICATE_PIXEL_DISTANCE of 2500 is set for Illumina sequencers that use patterned flowcells to estimate the number of sequencer duplicates. Sequencer duplicates are a subspecies of the duplicates that the tool flags. The Illumina HiSeq X and HiSeq 4000 platforms use patterned flowcells. If estimating library complexity (see section Duplicate metrics in brief) is important to you, then adjust the OPTICAL_DUPLICATE_PIXEL_DISTANCE appropriately for your sequencer platform.

+

Finally, in this task and others, we produce an MD5 file with the CREATE_MD5_FILE=true option. This creates a 128-bit hash value using the MD5 algorithm that is to files much like a fingerprint is to an individual. Compare MD5 values to verify data integrity, e.g. after moving or copying large files.

+
# Mark duplicate reads to avoid counting non-independent observations
+task MarkDuplicates {
+  Array[File] input_bams
+  String output_bam_basename
+  String metrics_filename
+  Int disk_size
+
+ # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly
+ # This works because the output of BWA is query-grouped, and thus so is the output of MergeBamAlignment.
+ # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname"
+  command {
+    java -Xmx4000m -jar /usr/gitc/picard.jar \
+      MarkDuplicates \
+      INPUT=${sep=' INPUT=' input_bams} \
+      OUTPUT=${output_bam_basename}.bam \
+      METRICS_FILE=${metrics_filename} \
+      VALIDATION_STRINGENCY=SILENT \
+      OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \
+      ASSUME_SORT_ORDER="queryname" \
+      CREATE_MD5_FILE=true
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "7 GB"
+    disks: "local-disk " + disk_size + " HDD"
+  }
+  output {
+    File output_bam = "${output_bam_basename}.bam"
+    File duplicate_metrics = "${metrics_filename}"
+  }
+}
+

+

SortAndFixTags

+

This task (i) sorts reads by coordinate and then (ii) corrects the NM tag values, adds UQ tags and indexes a BAM. The task pipes the two commands. First, SortSam sorts the records by genomic coordinate using the SORT_ORDER="coordinate" option. Second, SetNmAndUqTags calculates and fixes the UQ and NM tag values in the BAM. Because CREATE_INDEX=true, SetNmAndUqTags creates the .bai index. Again, we create an MD5 file with the CREATE_MD5_FILE=true option.

+

As mentioned in the MergeBamAlignment task, tag values dependent on coordinate-sorted records require correction in this separate task given this workflow maintains query-group ordering through the pre-processing steps.

+
# Sort BAM file by coordinate order and fix tag values for NM and UQ
+task SortAndFixTags {
+  File input_bam
+  String output_bam_basename
+  File ref_dict
+  File ref_fasta
+  File ref_fasta_index
+  Int disk_size
+  Int preemptible_tries
+
+  command {
+    java -Xmx4000m -jar /usr/gitc/picard.jar \
+    SortSam \
+    INPUT=${input_bam} \
+    OUTPUT=/dev/stdout \
+    SORT_ORDER="coordinate" \
+    CREATE_INDEX=false \
+    CREATE_MD5_FILE=false | \
+    java -Xmx500m -jar /usr/gitc/picard.jar \
+    SetNmAndUqTags \
+    INPUT=/dev/stdin \
+    OUTPUT=${output_bam_basename}.bam \
+    CREATE_INDEX=true \
+    CREATE_MD5_FILE=true \
+    REFERENCE_SEQUENCE=${ref_fasta}
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    disks: "local-disk " + disk_size + " HDD"
+    cpu: "1"
+    memory: "5000 MB"
+    preemptible: preemptible_tries
+  }
+  output {
+    File output_bam = "${output_bam_basename}.bam"
+    File output_bam_index = "${output_bam_basename}.bai"
+    File output_bam_md5 = "${output_bam_basename}.bam.md5"
+  }
+}
+

+

CreateSequenceGroupingTSV

+

This task uses a python script written as a single command using heredoc syntax to create a list of contig groupings. The workflow uses the intervals to scatter the base quality recalibration step [3] that calls on BaseRecalibrator and ApplyBQSR tasks.

+

This workflow specifically uses Python v2.7.

+

The input to the task is the reference .dict dictionary that lists contigs. The code takes the information provided by the SN and LN tags of each @SQ line in the dictionary to pair the information in a tuple list. The SN tag names a contig while the LN tag measures the contig length. This list is ordered by descending contig length.

+

The contig groupings this command creates are in WDL array format, where each line represents a group and each group's members are tab-separated. The command adds contigs to each group from the previously length-sorted list in descending order and caps the sum of member lengths at the first contig's sequence length (the longest contig). This has the effect of somewhat evenly distributing sequence per group. For GRCh38, CreateSequenceGroupingTSV-stdout.log shows 18 such groups.

+

As the code adds contig names to groups, it adds a :1+ to the end of each name. This is to protect the names from downstream tool behavior that removes elements after the last : within a contig name. GRCh38 introduces contig names that include :s, and removing the last element makes certain contigs indistinguishable from others. With this appendage, we preserve the original contig names through downstream processes. GATK v3.5 and prior versions require this addition.

+
# Generate sets of intervals for scatter-gathering over chromosomes
+task CreateSequenceGroupingTSV {
+  File ref_dict
+  Int preemptible_tries
+
+  # Use python to create the Sequencing Groupings used for BQSR and PrintReads Scatter.  It outputs to stdout
+  # where it is parsed into a wdl Array[Array[String]]
+  # e.g. [["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]]
+  command <<<
+    python <<CODE
+    with open("${ref_dict}", "r") as ref_dict_file:
+        sequence_tuple_list = []
+        longest_sequence = 0
+        for line in ref_dict_file:
+            if line.startswith("@SQ"):
+                line_split = line.split("\t")
+                # (Sequence_Name, Sequence_Length)
+                sequence_tuple_list.append((line_split[1].split("SN:")[1], int(line_split[2].split("LN:")[1])))
+        longest_sequence = sorted(sequence_tuple_list, key=lambda x: x[1], reverse=True)[0][1]
+
+    # We are adding this to the intervals because hg38 has contigs named with embedded colons and a bug in GATK strips off
+    # the last element after a :, so we add this as a sacrificial element.
+    hg38_protection_tag = ":1+"
+    # initialize the tsv string with the first sequence
+    tsv_string = sequence_tuple_list[0][0] + hg38_protection_tag
+    temp_size = sequence_tuple_list[0][1]
+    for sequence_tuple in sequence_tuple_list[1:]:
+        if temp_size + sequence_tuple[1] <= longest_sequence:
+            temp_size += sequence_tuple[1]
+            tsv_string += "\t" + sequence_tuple[0] + hg38_protection_tag
+        else:
+            tsv_string += "\n" + sequence_tuple[0] + hg38_protection_tag
+            temp_size = sequence_tuple[1]
+
+    print tsv_string
+    CODE
+  >>>
+  runtime {
+    docker: "python:2.7"
+    memory: "2 GB"
+    preemptible: preemptible_tries
+  }
+  output {
+    Array[Array[String]] sequence_grouping = read_tsv(stdout())
+  }
+}
+

+

BaseRecalibrator

+

The task runs BaseRecalibrator to detect errors made by the sequencer in estimating base quality scores. BaseRecalibrator builds a model of covariation from mismatches in the alignment data while excluding known variant sites and creates a recalibration report for use in the next step. The engine parameter --useOriginalQualities asks BaseRecalibrator to use original sequencer-produced base qualities stored in the OQ tag if present or otherwise use the standard QUAL score. The known sites files should include sites of known common SNPs and INDELs.

+

This task runs per interval grouping defined by each -L option. The sep in -L ${sep=" -L " sequence_group_interval} ensures each interval in the sequence_group_interval list is given to the command with its own -L option.

+
# Generate Base Quality Score Recalibration (BQSR) model
+task BaseRecalibrator {
+  File input_bam
+  File input_bam_index
+  String recalibration_report_filename
+  Array[String] sequence_group_interval
+  File dbSNP_vcf
+  File dbSNP_vcf_index
+  Array[File] known_indels_sites_VCFs
+  Array[File] known_indels_sites_indices
+  File ref_dict
+  File ref_fasta
+  File ref_fasta_index
+  Int disk_size
+  Int preemptible_tries
+
+  command {
+    java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \
+      -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCDetails \
+      -Xloggc:gc_log.log -Dsamjdk.use_async_io=false -Xmx4000m \
+      -jar /usr/gitc/GATK4.jar \
+      BaseRecalibrator \
+      -R ${ref_fasta} \
+      -I ${input_bam} \
+      --useOriginalQualities \
+      -O ${recalibration_report_filename} \
+      -knownSites ${dbSNP_vcf} \
+      -knownSites ${sep=" -knownSites " known_indels_sites_VCFs} \
+      -L ${sep=" -L " sequence_group_interval}
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "6 GB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+  output {
+    File recalibration_report = "${recalibration_report_filename}"
+    #this output is only for GOTC STAGING to give some GC statistics to the GATK4 team
+    #File gc_logs = "gc_log.log"
+  }
+}
+

+

GatherBqsrReports

+

This task consolidates the recalibration reports from each sequence group interval into a single report using GatherBqsrReports.

+
# Combine multiple recalibration tables from scattered BaseRecalibrator runs
+task GatherBqsrReports {
+  Array[File] input_bqsr_reports
+  String output_report_filename
+  Int disk_size
+  Int preemptible_tries
+
+  command {
+    java -Xmx3000m -jar /usr/gitc/GATK4.jar \
+      GatherBQSRReports \
+      -I ${sep=' -I ' input_bqsr_reports} \
+      -O ${output_report_filename}
+    }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "3500 MB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+  output {
+    File output_bqsr_report = "${output_report_filename}"
+  }
+}
+

+

ApplyBQSR

+

The task uses ApplyBQSR and the recalibration report to correct base quality scores in the BAM. Again, using parallelization, this task applies recalibration for the sequence intervals defined with -L. A resulting recalibrated BAM will contain only reads for the intervals in the applied intervals list.

+
# Apply Base Quality Score Recalibration (BQSR) model
+task ApplyBQSR {
+  File input_bam
+  File input_bam_index
+  String output_bam_basename
+  File recalibration_report
+  Array[String] sequence_group_interval
+  File ref_dict
+  File ref_fasta
+  File ref_fasta_index
+  Int disk_size
+  Int preemptible_tries
+
+  command {
+    java -XX:+PrintFlagsFinal -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps \
+      -XX:+PrintGCDetails -Xloggc:gc_log.log -Dsamjdk.use_async_io=false \
+      -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx3000m \
+      -jar /usr/gitc/GATK4.jar \
+      ApplyBQSR \
+      --createOutputBamMD5 \
+      --addOutputSAMProgramRecord \
+      -R ${ref_fasta} \
+      -I ${input_bam} \
+      --useOriginalQualities \
+      -O ${output_bam_basename}.bam \
+      -bqsr ${recalibration_report} \
+      -SQQ 10 -SQQ 20 -SQQ 30 -SQQ 40 \
+      --emit_original_quals \
+      -L ${sep=" -L " sequence_group_interval}
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "3500 MB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+  output {
+    File recalibrated_bam = "${output_bam_basename}.bam"
+    File recalibrated_bam_checksum = "${output_bam_basename}.bam.md5"
+    #this output is only for GOTC STAGING to give some GC statistics to the GATK4 team
+    #File gc_logs = "gc_log.log"
+  }
+}
+

+

GatherBamFiles

+

This task concatenates the provided BAMs, in order, into a single BAM and retains the header of the first file. For this pipeline, the inputs are the recalibrated sequence-grouped BAMs and the recalibrated unmapped-reads BAM. For GRCh38, this makes 19 BAM files that the task concatenates together. The resulting BAM is already in coordinate-sorted order. The task creates a new index and MD5 file for the concatenated BAM.

+
# Combine multiple recalibrated BAM files from scattered ApplyRecalibration runs
+task GatherBamFiles {
+  Array[File] input_bams
+  File input_unmapped_reads_bam
+  String output_bam_basename
+  Int disk_size
+  Int preemptible_tries
+
+  command {
+    java -Xmx2000m -jar /usr/gitc/picard.jar \
+      GatherBamFiles \
+      INPUT=${sep=' INPUT=' input_bams} \
+      INPUT=${input_unmapped_reads_bam} \
+      OUTPUT=${output_bam_basename}.bam \
+      CREATE_INDEX=true \
+      CREATE_MD5_FILE=true
+
+    }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "3 GB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+  output {
+    File output_bam = "${output_bam_basename}.bam"
+    File output_bam_index = "${output_bam_basename}.bai"
+    File output_bam_md5 = "${output_bam_basename}.bam.md5"
+  }
+}
+

+

ConvertToCram

+

This task compresses a BAM to the even smaller CRAM format using the -C option of Samtools. The task then indexes the CRAM and renames the index from {basename}.cram.crai to {basename}.crai. CRAM is a new format and tools are actively refining features for compatibility. Make sure your tool chain is compatible with CRAM before deleting BAMs. Be aware when using CRAMs that you will have to specify the identical reference genome, not just an equivalent reference, with matching MD5 hashes for each contig. These can differ if the capitalization of reference sequences differs.

+
# Convert BAM file to CRAM format
+task ConvertToCram {
+  File input_bam
+  File ref_fasta
+  File ref_fasta_index
+  String output_basename
+  Int disk_size
+
+  # Note that we are not activating pre-emptible instances for this step yet,
+  #  but we should if it ends up being fairly quick
+  command <<<
+      samtools view -C -T ${ref_fasta} ${input_bam} | \
+      tee ${output_basename}.cram | \
+      md5sum > ${output_basename}.cram.md5 && \
+      samtools index ${output_basename}.cram && \
+      mv ${output_basename}.cram.crai ${output_basename}.crai
+  >>>
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "3 GB"
+    cpu: "1"
+    disks: "local-disk " + disk_size + " HDD"
+  }
+  output {
+    File output_cram = "${output_basename}.cram"
+    File output_cram_index = "${output_basename}.crai"
+    File output_cram_md5 = "${output_basename}.cram.md5"
+  }
+}
+

+

HaplotypeCaller

+

This task runs HaplotypeCaller on the recalibrated BAM for the given intervals and produces variant calls in GVCF format. HaplotypeCaller reassembles and realigns reads around variants and calls genotypes and genotype likelihoods for single nucleotide polymorphism (SNP) and insertion and deletion (INDEL) variants. Proximal variants are phased. The resulting file is GZ-compressed, a valid VCF format file with extension .vcf.gz, containing variants for the given interval.

+ +

The -ERC GVCF or emit reference confidence mode activates two GVCF features. First, for each variant call, we now include a symbolic <NON_REF> non-reference allele. Second, for non-variant regions, we now include <NON_REF> summary blocks as calls.

+ +
# Call variants on a single sample with HaplotypeCaller to produce a GVCF
+task HaplotypeCaller {
+  File input_bam
+  File input_bam_index
+  File interval_list
+  String gvcf_basename
+  File ref_dict
+  File ref_fasta
+  File ref_fasta_index
+  Float? contamination
+  Int disk_size
+  Int preemptible_tries
+
+  # tried to find lowest memory variable where it would still work, might change once tested on JES
+  command {
+    java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \
+      -jar /usr/gitc/GATK35.jar \
+      -T HaplotypeCaller \
+      -R ${ref_fasta} \
+      -o ${gvcf_basename}.vcf.gz \
+      -I ${input_bam} \
+      -L ${interval_list} \
+      -ERC GVCF \
+      --max_alternate_alleles 3 \
+      -variant_index_parameter 128000 \
+      -variant_index_type LINEAR \
+      -contamination ${default=0 contamination} \
+      --read_filter OverclippedRead
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "10 GB"
+    cpu: "1"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+  output {
+    File output_gvcf = "${gvcf_basename}.vcf.gz"
+    File output_gvcf_index = "${gvcf_basename}.vcf.gz.tbi"
+  }
+}
+

+

GatherVCFs

+

The task uses MergeVcfs to combine multiple VCF files into a single VCF file and index.

+
# Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs
+task GatherVCFs {
+  Array[File] input_vcfs
+  Array[File] input_vcfs_indexes
+  String output_vcf_name
+  Int disk_size
+  Int preemptible_tries
+
+  # using MergeVcfs instead of GatherVcfs so we can create indices
+  # WARNING 2015-10-28 15:01:48 GatherVcfs  Index creation not currently supported when gathering block compressed VCFs.
+  command {
+    java -Xmx2g -jar /usr/gitc/picard.jar \
+    MergeVcfs \
+    INPUT=${sep=' INPUT=' input_vcfs} \
+    OUTPUT=${output_vcf_name}
+  }
+  output {
+    File output_vcf = "${output_vcf_name}"
+    File output_vcf_index = "${output_vcf_name}.tbi"
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "3 GB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+}
+

back to top

+
\ No newline at end of file diff --git a/doc_archive/methods/Selecting_variants_of_interest_from_a_callset.md b/doc_archive/methods/Selecting_variants_of_interest_from_a_callset.md new file mode 100644 index 000000000..a8f69fa82 --- /dev/null +++ b/doc_archive/methods/Selecting_variants_of_interest_from_a_callset.md @@ -0,0 +1,50 @@ +## Selecting variants of interest from a callset + +http://gatkforums.broadinstitute.org/gatk/discussion/54/selecting-variants-of-interest-from-a-callset + +

This document describes why you might want to extract a subset of variants from a callset and how you would achieve this.

+
+

Often, a VCF containing many samples and/or variants will need to be subset in order to facilitate certain analyses (e.g. comparing and contrasting cases vs. controls; extracting variant or non-variant loci that meet certain requirements, displaying just a few samples in a browser like IGV, etc.). The GATK tool that we use the most for subsetting calls in various ways is SelectVariants; it enables easy and convenient subsetting of VCF files according to many criteria.

+

SelectVariants operates on VCF files (also sometimes referred to as ROD in our documentation, for Reference Ordered Data) provided at the command line using the GATK's built-in --variant option. You can provide multiple VCF files to SelectVariants, but at least one must be named 'variant' and this will be the file (or set of files) from which variants will be selected. Other files can be used to modify the selection based on concordance or discordance between the callsets (see the --discordance / --concordance arguments in the tool documentation).

+

There are many options for setting the selection criteria, depending on what you want to achieve. For example, given a single VCF file, one or more samples can be extracted from the file, based either on a complete sample name, or on a pattern match. Variants can also be selected based on annotated properties, such as depth of coverage or allele frequency. This is done using JEXL expressions; make sure to read the linked document for details, especially the section on working with complex expressions.

+

Note that in the output VCF, some annotations such as AN (number of alleles), AC (allele count), AF (allele frequency), and DP (depth of coverage) are recalculated as appropriate to accurately reflect the composition of the subset callset. See further below for an explanation of how that works.

+
+

Command-line arguments

+

For a complete, detailed argument reference, refer to the GATK document page here.

+
+

Subsetting by sample and ALT alleles

+

SelectVariants now keeps (r5832) the alt allele, even if a record is AC=0 after subsetting the site down to selected samples. For example, when selecting down to just sample NA12878 from the OMNI VCF in 1000G (1525 samples), the resulting VCF will look like:

+
1       82154   rs4477212       A       G       .       PASS    AC=0;AF=0.00;AN=2;CR=100.0;DP=0;GentrainScore=0.7826;HW=1.0     GT:GC   0/0:0.7205
+1       534247  SNP1-524110     C       T       .       PASS    AC=0;AF=0.00;AN=2;CR=99.93414;DP=0;GentrainScore=0.7423;HW=1.0  GT:GC   0/0:0.6491
+1       565286  SNP1-555149     C       T       .       PASS    AC=2;AF=1.00;AN=2;CR=98.8266;DP=0;GentrainScore=0.7029;HW=1.0   GT:GC   1/1:0.3471
+1       569624  SNP1-559487     T       C       .       PASS    AC=2;AF=1.00;AN=2;CR=97.8022;DP=0;GentrainScore=0.8070;HW=1.0   GT:GC   1/1:0.3942
+

Although NA12878 is 0/0 at the first two sites, the ALT allele is preserved in the VCF records. This is the correct behavior, as subsetting the samples shouldn't change the character of the site, only the AC in the subpopulation. This is related to the tricky issue of isPolymorphic() vs. isVariant().

+ +

For comparison, in previous versions of SelectVariants, the first two monomorphic sites lost the ALT allele, because NA12878 is hom-ref at those sites, resulting in a VCF that looks like this:

+
1       82154   rs4477212       A       .       .       PASS    AC=0;AF=0.00;AN=2;CR=100.0;DP=0;GentrainScore=0.7826;HW=1.0     GT:GC   0/0:0.7205
+1       534247  SNP1-524110     C       .       .       PASS    AC=0;AF=0.00;AN=2;CR=99.93414;DP=0;GentrainScore=0.7423;HW=1.0  GT:GC   0/0:0.6491
+1       565286  SNP1-555149     C       T       .       PASS    AC=2;AF=1.00;AN=2;CR=98.8266;DP=0;GentrainScore=0.7029;HW=1.0   GT:GC   1/1:0.3471
+1       569624  SNP1-559487     T       C       .       PASS    AC=2;AF=1.00;AN=2;CR=97.8022;DP=0;GentrainScore=0.8070;HW=1.0   GT:GC   1/1:0.3942
+

If you really want a VCF without monomorphic sites, use the option to drop monomorphic sites after subsetting.

+
+

How do the AC, AF, AN, and DP fields change?

+

Let's say you have a file with three samples. The numbers before the ":" will be the genotype (0/0 is hom-ref, 0/1 is het, and 1/1 is hom-var), and the number after will be the depth of coverage.

+
BOB        MARY        LINDA
+1/0:20     0/0:30      1/1:50
+

In this case, the INFO field will say AN=6, AC=3, AF=0.5, and DP=100 (in practice, I think these numbers won't necessarily add up perfectly because of some read filters we apply when calling, but it's approximately right).

+

Now imagine I only want a file with the samples "BOB" and "MARY". The new file would look like:

+
BOB        MARY
+1/0:20     0/0:30
+

The INFO field will now have to change to reflect the state of the new data. It will be AN=4, AC=1, AF=0.25, DP=50.

+

Let's pretend that MARY's genotype wasn't 0/0, but was instead "./." (no genotype could be ascertained). This would look like

+
BOB        MARY
+1/0:20     ./.:.
+

with AN=2, AC=1, AF=0.5, and DP=20.
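The bookkeeping is simple enough to sketch in a few lines of Python. This is not GATK code, just an illustration of the recalculation described above, using the made-up genotypes and depths from the example; real callers also apply read filters, so DP will not always be a simple sum.

```python
# Minimal sketch (not GATK code): recompute AN, AC, AF and DP for a
# biallelic site after subsetting to a chosen set of samples.
def recompute_info(genotypes, samples_to_keep):
    """genotypes maps sample name -> (GT string, depth), e.g. {"BOB": ("1/0", 20)}."""
    an = ac = dp = 0
    for sample in samples_to_keep:
        gt, depth = genotypes[sample]
        called = [a for a in gt.replace("|", "/").split("/") if a != "."]
        an += len(called)                          # every called allele counts toward AN
        ac += sum(1 for a in called if a != "0")   # non-reference alleles count toward AC
        if called:
            dp += depth                            # depth only from samples with a genotype
    af = ac / an if an else 0.0
    return an, ac, af, dp

site = {"BOB": ("1/0", 20), "MARY": ("0/0", 30), "LINDA": ("1/1", 50)}
print(recompute_info(site, ["BOB", "MARY", "LINDA"]))              # (6, 3, 0.5, 100)
print(recompute_info(site, ["BOB", "MARY"]))                       # (4, 1, 0.25, 50)
print(recompute_info({"BOB": ("1/0", 20), "MARY": ("./.", 0)},
                     ["BOB", "MARY"]))                             # (2, 1, 0.5, 20)
```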

+
+

Additional information

+

For information on how to construct regular expressions for use with this tool, see the method article on variant filtering with JEXL, or "Summary of regular-expression constructs" section here for more hardcore reading.

\ No newline at end of file diff --git a/doc_archive/methods/Statistical_methods:_Fisher’s_Exact_Test.md b/doc_archive/methods/Statistical_methods:_Fisher’s_Exact_Test.md new file mode 100644 index 000000000..5fb0126b6 --- /dev/null +++ b/doc_archive/methods/Statistical_methods:_Fisher’s_Exact_Test.md @@ -0,0 +1,209 @@ +## Statistical methods: Fisher’s Exact Test + +http://gatkforums.broadinstitute.org/gatk/discussion/8056/statistical-methods-fisher-s-exact-test + +

Overview

+

Fisher’s Exact Test is a statistical test that is used to analyze contingency tables, where contingency tables are matrices that contain the frequencies of the variables in play. According to statistics lore, noted statistician R.A. Fisher invented the test to determine if Dr. Muriel Bristol could actually tell the difference between milk being added to her tea or tea being added to her milk (she couldn’t). Fisher’s Exact Test is so named because it allows us to calculate the exact p-value for the experiment, rather than having to rely on an approximation. The p-value gives us the probability of observing a set of results at least as extreme as the one we obtained if the null hypothesis were true, i.e. of getting those results purely by chance.

+
+

Mathematical theory

+

The Wolfram Math World article on Fisher’s Exact Test includes some very helpful information on the theoretical underpinnings of the test, as well as an example of how it can be applied.

+
+

Use in GATK

+

In GATK, we use Fisher’s Exact Test to calculate the FisherStrand annotation, which is an indicator of strand bias, a common source of artifactual calls. The test determines whether there is a difference in the number of reads that support the reference allele and alternate allele on each strand (i.e. number of reads in forward and reverse orientation). The value is reported in the FisherStrand annotation, FS in the VCF.

+
+

Example: Fisher Strand in practice

+

Note: This example follows the steps given in the Wolfram article linked above.

+

In this example, we want to determine if there is a difference in the number of reads that support the reference allele and alternate allele on each strand. Our null hypothesis is that there is no difference in the number of reads that support the reference allele and alternate allele on each strand (there is no strand bias). We will calculate a p-value that tells us the probability of observing our data if our null hypothesis is true (or, that there is no strand bias). The lower the p-value, the less likely we are to believe that there is no strand bias.

+

Let’s say we have 3 reads supporting the reference allele on the forward strand and 0 reads supporting the reference allele on the reverse strand. We also have 0 reads supporting the alternate allele on the forward strand and 3 reads supporting the alternate allele on the reverse strand.

+

The contingency table, or matrix, looks like this:

|                  | Forward Strand | Reverse Strand | Total |
|------------------|----------------|----------------|-------|
| Reference Allele | 3              | 0              | 3     |
| Alternate Allele | 0              | 3              | 3     |
| Total            | 3              | 3              | 6     |
+

At first glance, it seems obvious there is some bias going on here, because each allele is only seen either on the forward strand or the reverse strand. To determine with confidence whether there really is strand bias, we will perform Fisher’s Exact Test on this set of observations.

+

We first use the hypergeometric probability function to calculate the probability of getting the exact matrix we have above. The probability calculation for a 2 x 2 matrix is:

+

$$ P = \frac{ R_{1}! \times R_{2}! \times C_{1}! \times C_{2}! }{ N! \times \prod_{ij} a_{ij}! } $$

+

Let’s define the variables in that equation:

- $R_{1}$ and $R_{2}$ are the row totals
- $C_{1}$ and $C_{2}$ are the column totals
- $N$ is the total number of observations
- $a_{ij}$ is the count in the cell at row $i$, column $j$ (the denominator multiplies the factorial of every cell)

Now, let’s calculate the probability P for our own matrix above:

+

$$P = \frac{3! \times 3! \times 3! \times 3!}{6! \times 3! \times 0! \times 0! \times 3!} = 0.05 $$

+

That gives us the probability of observing our own data. However, for our test, we need the probability of observing our own data and more extreme data. So now we need to calculate the probability of observing more extreme data, which we'll define as any matrix that has the same row and column totals as our own, and also has a probability equal to or less than our matrix probability.

+

Matrix probability calculations

+

Let's find all possible matrices of non-negative integers that would be consistent with the given row and column totals (i.e. total number of observations) and calculate their probability using the formula above.

|                  | Forward Strand | Reverse Strand | Total |
|------------------|----------------|----------------|-------|
| Reference Allele | 3              | 0              | 3     |
| Alternate Allele | 0              | 3              | 3     |
| Total            | 3              | 3              | 6     |
+

$$P = \frac{3! \times 3! \times 3! \times 3!}{6! \times 3! \times 0! \times 0! \times 3!} = 0.05 $$

|                  | Forward Strand | Reverse Strand | Total |
|------------------|----------------|----------------|-------|
| Reference Allele | 2              | 1              | 3     |
| Alternate Allele | 1              | 2              | 3     |
| Total            | 3              | 3              | 6     |
+

$$P = \frac{3! \times 3! \times 3! \times 3!}{6! \times 2! \times 1! \times 1! \times 2!} = 0.45 $$

|                  | Forward Strand | Reverse Strand | Total |
|------------------|----------------|----------------|-------|
| Reference Allele | 1              | 2              | 3     |
| Alternate Allele | 2              | 1              | 3     |
| Total            | 3              | 3              | 6     |
+

$$P = \frac{3! \times 3! \times 3! \times 3!}{6! \times 1! \times 2! \times 2! \times 1!} = 0.45 $$

|                  | Forward Strand | Reverse Strand | Total |
|------------------|----------------|----------------|-------|
| Reference Allele | 0              | 3              | 3     |
| Alternate Allele | 3              | 0              | 3     |
| Total            | 3              | 3              | 6     |
+

$$P = \frac{3! \times 3! \times 3! \times 3!}{6! \times 0! \times 3! \times 3! \times 0!} = 0.05 $$

+

Results

+

We see that the only matrix with a probability less than or equal to that of our own matrix is the last one shown above, hypothetical matrix 3 (the one with both alleles flipped to the opposite strands). We will now add the probabilities of our own matrix and matrix 3 to get the final p-value.

+

Summing all matrix probabilities less than or equal to 0.05 (the probability of our observed matrix) gives the overall p-value:

+

$$P_{total} = 0.05\ \text{(original)} + 0.05\ \text{(matrix 3)} = 0.1 $$

+

The p-value of 0.1 tells us there is a 10% chance of observing counts at least this skewed purely by chance if there were no strand bias, so despite our strong intuition that the numbers look biased, we cannot confidently reject the null hypothesis. This is because there are only 6 reads, and we can’t confidently say that there is really strand bias at work based on so few reads (observations). If we had seen more, we may have had more evidence to confidently say there is bias -- or we might have realized there is no bias at this site, and the numbers we saw were an accidental effect. If you’d like to see how our confidence scales with read numbers, try working out several cases with larger numbers of reads. You’ll need to draw up a lot of possible matrices!

+

Anyway, in the GATK context we still want to transform our FS annotation value to Phred scale for convenience before writing it out to the output VCF. To get the Phred-scaled p-value, we simply plug in the p-value of 0.1 into the Phred equation like this:

+

$$ \text{Phred Score} = -10 \times \log_{10} \text{p-value} = -10 \times \log_{10} 0.1 = 10 $$

+

So the value of FS at this site would be 10. Note that if we had a p-value of 1, meaning the observed counts are entirely consistent with there being no bias, the Phred score would be 0. So, a Phred score closer to 0 means there is less evidence of bias, and higher FS values therefore indicate more bias. See the documentation article on understanding hard-filtering recommendations for more commentary on how we interpret the value of FS in practice.
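For readers who want to verify the arithmetic, here is a small Python sketch (not the GATK implementation) that enumerates every table with the same margins, sums the probabilities of the tables at least as extreme as the observed one, and converts the result to the Phred scale. It reproduces the p-value of 0.1 and the FS value of 10 from the worked example above.

```python
# Minimal sketch of a two-sided Fisher's Exact Test on a 2x2 strand table,
# followed by Phred-scaling of the p-value (as done for the FS annotation).
from math import factorial, log10

def table_prob(a, b, c, d):
    """Hypergeometric probability of the 2x2 table [[a, b], [c, d]]."""
    num = factorial(a + b) * factorial(c + d) * factorial(a + c) * factorial(b + d)
    den = factorial(a + b + c + d) * factorial(a) * factorial(b) * factorial(c) * factorial(d)
    return num / den

def fisher_strand(ref_fwd, ref_rev, alt_fwd, alt_rev):
    p_obs = table_prob(ref_fwd, ref_rev, alt_fwd, alt_rev)
    row1, row2 = ref_fwd + ref_rev, alt_fwd + alt_rev
    col1 = ref_fwd + alt_fwd
    p_value = 0.0
    # every table with the same margins is determined by its top-left cell
    for a in range(min(row1, col1) + 1):
        b, c = row1 - a, col1 - a
        d = row2 - c
        if min(b, c, d) < 0:
            continue
        p = table_prob(a, b, c, d)
        if p <= p_obs + 1e-12:          # "as extreme or more extreme" than observed
            p_value += p
    return p_value

p = fisher_strand(3, 0, 0, 3)
print(round(p, 3))                       # 0.1
print(round(-10 * log10(p), 1))          # 10.0, the Phred-scaled FS value
```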

\ No newline at end of file diff --git a/doc_archive/methods/Statistical_methods:_Inbreeding_Coefficient.md b/doc_archive/methods/Statistical_methods:_Inbreeding_Coefficient.md new file mode 100644 index 000000000..65cc8451b --- /dev/null +++ b/doc_archive/methods/Statistical_methods:_Inbreeding_Coefficient.md @@ -0,0 +1,45 @@ +## Statistical methods: Inbreeding Coefficient + +http://gatkforums.broadinstitute.org/gatk/discussion/8032/statistical-methods-inbreeding-coefficient + +

Overview

+

Although the name Inbreeding Coefficient suggests it is a measure of inbreeding, Inbreeding Coefficient measures the excess heterozygosity at a variant site. It can be used as a proxy for poor mapping: sites with a large excess of heterozygotes (strongly negative Inbreeding Coefficient) are typically locations in the genome where the mapping is bad and the reads in the region mismatch it because they belong elsewhere. At least 10 samples are required (preferably many more) in order for this annotation to be calculated properly.

+

Theory

+

The Wikipedia article about Hardy-Weinberg principle includes some very helpful information on the theoretical underpinnings of the test, as Inbreeding Coefficient relies on the math behind the Hardy-Weinberg Principle.

+

Use in GATK

+

We calculate Inbreeding Coefficient as

+

$$ 1-\frac{ \text{# observed heterozygotes} }{ \text{# expected heterozygotes} } $$

+

The number of observed heterozygotes can be counted directly from the data. The number of expected heterozygotes is 2pq times the number of genotyped samples, where p is the frequency of the reference allele and q is the frequency of the alternate allele (AF). (Please see the Hardy-Weinberg Principle link above.)

+

A value of 0 suggests the site is in Hardy-Weinberg Equilibrium. Negative values of Inbreeding Coefficient could mean there are too many heterozygotes and suggest a site with bad mapping. The other nice side effect is that one of the error modes in variant calling is for all calls to be heterozygous, which this metric captures nicely. This is why we recommend filtering out variants with negative Inbreeding Coefficients. Although positive values suggest too few heterozygotes, we do not recommend filtering out positive values because they could arise from admixture of different ethnic populations.

+

Important note:

+

Inbreeding Coefficient assumes the samples are unrelated, and it is not robust to violations of that assumption: we have found that relatedness does break down the assumptions Inbreeding Coefficient is based on. For family samples, it really depends on how many families and samples you have. For example, if you have 3 families, Inbreeding Coefficient is not going to work. But, if you have 10,000 samples and just a few families, it should be fine. Also, if you pass in a pedigree file (*.ped), it will use that information to calculate Inbreeding Coefficient only using the founders (i.e. individuals whose parents aren't in the callset), and as long as there are >= 10 of those, the data should be pretty good.

+
+

Example: Inbreeding Coefficient

+

In this example, let's say we are working with 100 human samples, and we are trying to calculate Inbreeding Coefficient at a site that has A for the reference allele and T for the alternate allele.

+

Step 1: Count the number of samples that have each genotype

+

HOM-REF A/A : 51
HET A/T : 11
HOM-VAR T/T : 38

+

Step 2: Get all necessary information to solve equation

+

We need to find the # observed hets and # expected hets:

+

$$ \text{number of observed hets} = 11 $$

+

from the number of observed A/T given above, and

+

$$ \text{number of expected hets} = 2pq * \text{total genotypes} $$

+

where 2pq is the frequency of heterozygotes according to Hardy-Weinberg Equilibrium.

+

We need to multiply that frequency by the number of all genotypes in the population to get the expected number of heterozygotes.

+

So let's calculate p:

+

$$ p = \text{frequency of ref allele} = \frac{ \text{# ref alleles} }{ \text{total # alleles} } $$
$$ p = \frac{ 2 \times 51 + 11 }{ 2 \times 51 + 2 \times 11 + 2 \times 38 } $$
$$ p = \frac{ 113 }{ 200 } = 0.565 $$

+

And now let's calculate q:

+

$$ q = \text{frequency of alt allele} = \frac{ \text{# alt alleles} }{ \text{total # alleles} } $$
$$ q = \frac{ 2 \times 38 + 11 }{ 2 \times 51 + 2 \times 11 + 2 \times 38 } $$
$$ q = \frac{ 87 }{ 200 } = 0.435 $$

+

Remember that homozygous genotypes have two copies of the allele of interest (because we're assuming a diploid organism).

+

$$ \text{number of expected hets} = 2pq \times 100 $$
$$ = 2 \times 0.565 \times 0.435 \times 100 = 49.155 $$

+

Step 3: Plug in the Numbers

+

$$ \text{Inbreeding Coefficient} = 1 - \frac{ \text{# observed hets} }{ \text{# expected hets} } $$
$$ \text{IC} = 1 - \frac{ 11 }{ 49.155 } = 0.776 $$

+

Step 4: Interpret the output

+

Our Inbreeding Coefficient is 0.776. Because it is a positive number, we can see there are fewer than the expected number of heterozygotes according to the Hardy-Weinberg Principle. Too few heterozygotes can imply inbreeding. Depending on the cohort we are working with, this could be a sign of false positives.
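The whole calculation fits in a few lines of Python. This is a sketch of the worked example above, not GATK source code; it assumes a biallelic site, unrelated samples and complete genotypes.

```python
# Minimal sketch: inbreeding coefficient from genotype counts at a biallelic site.
def inbreeding_coefficient(n_hom_ref, n_het, n_hom_var):
    n = n_hom_ref + n_het + n_hom_var           # number of genotyped samples
    p = (2 * n_hom_ref + n_het) / (2.0 * n)     # reference allele frequency
    q = (2 * n_hom_var + n_het) / (2.0 * n)     # alternate allele frequency
    expected_hets = 2 * p * q * n               # Hardy-Weinberg expectation
    return 1 - n_het / expected_hets

print(round(inbreeding_coefficient(51, 11, 38), 3))   # 0.776
```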

\ No newline at end of file diff --git a/doc_archive/methods/Statistical_methods:_Rank_Sum_Test.md b/doc_archive/methods/Statistical_methods:_Rank_Sum_Test.md new file mode 100644 index 000000000..151d503c7 --- /dev/null +++ b/doc_archive/methods/Statistical_methods:_Rank_Sum_Test.md @@ -0,0 +1,57 @@ +## Statistical methods: Rank Sum Test + +http://gatkforums.broadinstitute.org/gatk/discussion/8031/statistical-methods-rank-sum-test + +

Overview

+

The Rank Sum Test, also known as the Mann-Whitney-Wilcoxon U-test after its developers (who are variously credited in subsets and in different orders depending on the sources you read), is a statistical test that aims to determine whether there is a significant difference between the values of two populations of data.

+

Theory

+

The Wikipedia article about the Rank Sum Test includes some very helpful information on the theoretical underpinnings of the test, as well as various examples of how it can be applied.

+

Use in GATK

+

This test is used by several GATK annotations, including two standard annotations that are used for variant recalibration in the Best Practices: MappingQualityRankSum and ReadPosRankSum. In all cases, the idea is to check, for a given candidate variant, whether the properties of the data that support the reference allele are similar to those of the data that support a variant allele. If they are not similar, we conclude that there may be some technical bias and that the candidate variant may be an artifact.

+
+

Example: BaseQualityRankSumTest

+

Note: this example applies Method 2 from the Wikipedia article linked above.

+

In this example, we have a set of 20 reads, 10 of which support the reference allele and 10 of which support the alternate allele. At first glance, that looks like a clear heterozygous 0/1 site. But to be thorough in our analysis and to account for any technical bias, we want to determine if there is a significant difference in the base qualities of the bases that support the reference allele vs. the bases that support the alternate allele.

+

Before we proceed, we must define our null hypothesis and alternate hypothesis.

+

- Null hypothesis: There is no difference in the base qualities that support the reference allele and the base qualities that support the alternate allele.

+

- Alternate hypothesis: There is a difference in the base qualities that support the reference allele and the base qualities that support the alternate allele.

+

Step 1: List the relevant observations

+

Reference allele base qualities: 20, 25, 26, 30, 32, 40, 47, 50, 53, 60
Alternate allele base qualities: 0, 7, 10, 17, 20, 21, 30, 34, 40, 45

+

Step 2: Rank the observations

+

First, we arrange all the observations (base qualities) into a list of values ordered from lowest to highest (reference bases are in bold).

+

0, 7, 10, 17, 20, **20**, 21, **25**, **26**, 30, **30**, **32**, 34, 40, **40**, 45, **47**, **50**, **53**, **60**

+

Next we determine the ranks of the values. Since there are 20 observations (the base qualities), we have 20 ranks to assign. Whenever there are ties between observations for the rank, we take the rank to be equal to the midpoint of the ranks. For example, for 20(ref) and 20(alt), we have a tie in values, so we assign each observation a rank of (5+6)/2 = 5.5.

+

The ranks from the above list are (reference ranks are in bold):

+

1, 2, 3, 4, 5.5, **5.5**, 7, **8**, **9**, 10.5, **10.5**, **12**, 13, 14.5, **14.5**, 16, **17**, **18**, **19**, **20**

+

Step 3: Add up the ranks for each group

+

We now need to add up the ranks for the base qualities that came from the reference allele and the alternate allele.

+

$$ Rank_{ref} = 133.5 $$

+

$$ Rank_{alt} = 76.5 $$

+

Step 4: Calculate U for each group

+

U is a statistic that tells us the difference between the two rank totals. We can use the U statistic to calculate the z-score (explained below), which will give us our p-value.

+

Calculate U for each group (n = number of observations in each sample)

+

$$ U_{ref} = n_{ref} \times n_{alt} + \frac{ n_{ref} (n_{ref} + 1) }{ 2 } - Rank_{ref} $$

+

$$ U_{alt} = n_{alt} \times n_{ref} + \frac{ n_{alt} (n_{alt} + 1) }{ 2 } - Rank_{alt} $$

+

$$ U_{ref} = 10 \times 10 + \frac{ 10 \times 11 }{ 2 } - 133.5 = 21.5 $$

+

$$ U_{alt} = 10 \times 10 + \frac{ 10 \times 11 }{ 2 } - 76.5 = 78.5 $$

+

Step 5: Calculate the overall z-score

+

Next, we need to calculate the z-score, which will allow us to get the p-value. The z-score is a normalized score that allows us to compare the probability of the U score occurring in our distribution. For more on standard scores, see:
https://statistics.laerd.com/statistical-guides/standard-score.php

+

The equation to get the z-score is:

+

$$ z = \frac{ U - m_{U} }{ \sigma_{U} } $$

+

Breaking this equation down:

+

$$ z = \text{the z-score} $$

+

$$ U = \text{lowest of the U scores calculated in previous steps} $$

+

$$ m_{U} = \text{mean of the U scores above} = \frac{ n_{ref} \times n_{alt} }{ 2 } $$

+

$$ \sigma_{U} = \text{standard deviation of the U scores} = \sqrt{ \frac{ n_{ref} \times n_{alt} \times (n_{ref} + n_{alt} + 1) }{ 12 } } $$

+

To calculate our z:

+

$$ U = 21.5 $$

+

$$ m_{U} = \frac{ 10 \times 10 }{ 2 } = 50 $$

+

$$ \sigma_{U} = \sqrt{ \frac{ 10 \times 10 \times (10 + 10 + 1) }{ 12 } } = 13.229 $$

+

So altogether we have:

+

$$ z = \frac{ 21.5 - 50 }{ 13.229 } = -2.154 $$

+

Step 6: Calculate and interpret the p-value

+

The p-value is the probability of obtaining a z-score at least as extreme as the one we got, assuming the null hypothesis is true. In our example, the p-value gives us the probability that there is no difference in the base qualities that support the reference allele and the base qualities that support the alternate allele. The lower the p-value, the less likely it is that there is no difference in the base qualities.

+

Going to the z-score table, or just using a p-value calculator, we find the p-value to be 0.0312.

+

This means there is a 3.12% chance of observing a difference in base qualities at least this large if the reference and alternate allele bases actually came from the same distribution. Using a p-value cutoff of 0.05, we have enough evidence to reject our null hypothesis that there is no difference in the base qualities of the reference and alternate allele. This indicates there is some bias and that the alternate allele is less well supported by the data than the allele counts suggest.
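Here is a Python sketch (not the GATK implementation) that follows the same steps: pool the observations, assign midpoint ranks to ties, compute the rank sums and U statistics, and apply the normal approximation to get a two-sided p-value. Like the worked example, it ignores the tie correction to the standard deviation.

```python
# Minimal sketch of the rank sum test as walked through above.
from math import sqrt, erfc

def rank_sum_test(ref_values, alt_values):
    pooled = sorted([(v, "ref") for v in ref_values] + [(v, "alt") for v in alt_values])
    rank_sum = {"ref": 0.0, "alt": 0.0}
    i = 0
    while i < len(pooled):
        j = i
        while j < len(pooled) and pooled[j][0] == pooled[i][0]:
            j += 1                                   # pooled[i:j] holds tied values
        mid_rank = (i + 1 + j) / 2.0                 # midpoint of ranks i+1 .. j
        for _, group in pooled[i:j]:
            rank_sum[group] += mid_rank
        i = j
    n_ref, n_alt = len(ref_values), len(alt_values)
    u_ref = n_ref * n_alt + n_ref * (n_ref + 1) / 2.0 - rank_sum["ref"]
    u_alt = n_ref * n_alt + n_alt * (n_alt + 1) / 2.0 - rank_sum["alt"]
    u = min(u_ref, u_alt)                            # use the smaller U score
    mean_u = n_ref * n_alt / 2.0
    sd_u = sqrt(n_ref * n_alt * (n_ref + n_alt + 1) / 12.0)
    z = (u - mean_u) / sd_u
    p_two_sided = erfc(abs(z) / sqrt(2))             # 2 * (1 - Phi(|z|))
    return z, p_two_sided

ref = [20, 25, 26, 30, 32, 40, 47, 50, 53, 60]
alt = [0, 7, 10, 17, 20, 21, 30, 34, 40, 45]
z, p = rank_sum_test(ref, alt)
print(round(z, 3), round(p, 4))                      # -2.154 0.0312
```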

\ No newline at end of file diff --git a/doc_archive/methods/Understanding_and_adapting_the_generic_hard-filtering_recommendations.md b/doc_archive/methods/Understanding_and_adapting_the_generic_hard-filtering_recommendations.md new file mode 100644 index 000000000..b3de2a721 --- /dev/null +++ b/doc_archive/methods/Understanding_and_adapting_the_generic_hard-filtering_recommendations.md @@ -0,0 +1,75 @@ +## Understanding and adapting the generic hard-filtering recommendations + +http://gatkforums.broadinstitute.org/gatk/discussion/6925/understanding-and-adapting-the-generic-hard-filtering-recommendations + +

This document aims to provide insight into the logic of the generic hard-filtering recommendations that we provide as a substitute for VQSR. Hopefully it will also serve as a guide for adapting these recommendations or developing new filters that are appropriate for datasets that diverge significantly from what we usually work with.

+
+

Introduction

+

Hard-filtering consists of choosing specific thresholds for one or more annotations and throwing out any variants that have annotation values above or below the set thresholds. By annotations, we mean properties or statistics that describe for each variant e.g. what the sequence context is like around the variant site, how many reads covered it, how many reads covered each allele, what proportion of reads were in forward vs reverse orientation, and so on.

+

The problem with this approach is that it is very limiting because it forces you to look at each annotation dimension individually, and you end up throwing out good variants just because one of their annotations looks bad, or keeping bad variants in order to keep those good variants.

+

In contrast, VQSR is more powerful because it uses machine-learning algorithms to learn from the data what are the annotation profiles of good variants (true positives) and of bad variants (false positives) in a particular dataset. This empowers you to pull out variants based on how they cluster together along different dimensions, and liberates you to a large extent from the linear tyranny of single-dimension thresholds.

+

Unfortunately this method requires a large number of variants and well-curated known variant resources. For those of you working with small gene panels or with non-model organisms, this is a deal-breaker, and you have to fall back on hard-filtering.

+
+

Outline

+

In this article, we illustrate how the generic hard-filtering recommendations we provide relate to the distribution of annotation values we typically see in callsets produced by our variant calling tools, and how this in turn relates to the underlying physical properties of the sequence data.

+

We also use results from VQSR filtering (which we take as ground truth in this context) to highlight the limitations of hard-filtering.

+

We do this in turn for each of six annotations that are highly informative among the recommended annotations: QD, FS, SOR, MQ, MQRankSum and ReadPosRankSum. The same principles can be applied to most other annotations produced by GATK tools.

+
+

Overview of data and methods

+

Origin of the dataset

+

We called variants on a whole genome trio (samples NA12878, NA12891, NA12892, previously pre-processed) using HaplotypeCaller in GVCF mode, yielding a GVCF file for each sample. We then joint-genotyped the GVCFs using GenotypeGVCFs, yielding an unfiltered VCF callset for the trio. Finally, we ran VQSR on the trio VCF, yielding the filtered callset. We will be looking at the SNPs only.

+

Plotting methods and interpretation notes

+

All plots shown below are density plots generated using the ggplot2 library in R. On the x-axis are the annotation values, and on the y-axis are the density values. The area under the density plot gives you the probability of observing the annotation values. So, the entire area under all of the plots will be equal to 1. However, if you would like to know the probability of observing an annotation value between 0 and 1, you will have to take the area under the curve between 0 and 1.

+

In plain English, this means that the plots show you, for a given set of variants, the distribution of their annotation values. The caveat is that when we're comparing two or more sets of variants on the same plot, we have to keep in mind that they may contain very different numbers of variants, so the number of variants in a given part of the distribution is not directly comparable; only their proportions are comparable.

+
+

QualByDepth (QD)

+

This is the variant confidence (from the QUAL field) divided by the unfiltered depth of non-hom-ref samples. This annotation is intended to normalize the variant quality in order to avoid inflation caused when there is deep coverage. For filtering purposes it is better to use QD than either QUAL or DP directly.

+

The generic filtering recommendation for QD is to filter out variants with QD below 2. Why is that?

+

First, let’s look at the QD values distribution for unfiltered variants. Notice the values can be anywhere from 0-40. There are two peaks where the majority of variants are (around QD = 12 and QD = 32). These two peaks correspond to variants that are mostly observed in heterozygous (het) versus mostly homozygous-variant (hom-var) states, respectively, in the called samples. This is because hom-var samples contribute twice as many reads supporting the variant as het samples do. We also see, to the left of the distribution, a "shoulder" of variants with QD hovering between 0 and 5.

+ +

We expect to see a similar distribution profile in callsets generated from most types of high-throughput sequencing data, although values where the peaks form may vary.

+

Now, let’s look at the plot of QD values for variants that passed VQSR and those that failed VQSR. Red indicates the variants that failed VQSR, and blue (green?) the variants that passed VQSR.

+ +

We see that the majority of variants filtered out correspond to that low-QD "shoulder" (remember that since this is a density plot, the y-axis indicates proportion, not number of variants); that is what we would filter out with the generic recommendation of the threshold value 2 for QD.

+

Notice however that VQSR has failed some variants that have a QD greater than 30! All those variants would have passed the hard filter threshold, but VQSR tells us that these variants looked artifactual in one or more other annotation dimensions. Conversely, although it is not obvious in the figure, we know that VQSR has passed some variants that have a QD less than 2, which hard filters would have eliminated from our callset.

+
+

FisherStrand (FS)

+

This is the Phred-scaled probability that there is strand bias at the site. Strand bias tells us whether the alternate allele was seen more or less often on the forward or reverse strand than the reference allele. When there is little to no strand bias at the site, the FS value will be close to 0.

+

Note: SB, SOR and FS are related but not the same! They all measure strand bias (a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other) in different ways. SB gives the raw counts of reads supporting each allele on the forward and reverse strand. FS is the result of using those counts in a Fisher's Exact Test. SOR is a related annotation that applies a different statistical test (using the SB counts) that is better for high coverage data.

+

Let’s look at the FS values for the unfiltered variants. The FS values have a very wide range; we made the x-axis log-scaled so the distribution is easier to see. Notice most variants have an FS value less than 10, and almost all variants have an FS value less than 100. However, there are indeed some variants with a value close to 400.

+ +

The plot below shows FS values for variants that passed VQSR and failed VQSR.

+ +

Notice most of the variants that fail have an FS value greater than 55. Our hard filtering recommendations tell us to fail variants with an FS value greater than 60. Notice that although we are able to remove many false positives by removing variants with FS greater than 60, we still keep many false positive variants. If we move the threshold to a lower value, we risk losing true positive variants.

+
+

StrandOddsRatio (SOR)

+

This is another way to estimate strand bias using a test similar to the symmetric odds ratio test. SOR was created because FS tends to penalize variants that occur at the ends of exons. Reads at the ends of exons tend to only be covered by reads in one direction and FS gives those variants a bad score. SOR will take into account the ratios of reads that cover both alleles.

+

Let’s look at the SOR values for the unfiltered variants. The SOR values range from 0 to greater than 9. Notice most variants have an SOR value less than 3, and almost all variants have an SOR value less than 9. However, there is a long tail of variants with a value greater than 9.

+ +

The plot below shows SOR values for variants that passed VQSR and failed VQSR.

+ +

Notice most of the variants that have an SOR value greater than 3 fail the VQSR filter. Although there is a non-negligible population of variants with an SOR value less than 3 that failed VQSR, our hard filtering recommendation of failing variants with an SOR value greater than 3 will at least remove the long tail of variants that show fairly clear bias according to the SOR test.

+
+

RMSMappingQuality (MQ)

+

This is the root mean square mapping quality over all the reads at the site. Instead of the average mapping quality of the site, this annotation gives the square root of the average of the squares of the mapping qualities at the site. It is meant to incorporate the standard deviation of the mapping qualities, which captures the variation in the dataset: a low standard deviation means the values are all close to the mean, whereas a high standard deviation means the values are all far from the mean. When the mapping qualities are good at a site, the MQ will be around 60.
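To make the distinction concrete, here is a toy Python comparison with invented mapping qualities: two sites with the same average MQ but different spread get different RMS values, which is exactly the extra information (the standard deviation) that this annotation is meant to capture.

```python
# Minimal sketch: RMS mapping quality vs. the plain mean. Two sites with the
# same mean MQ but different spread get different RMS values, because
# RMS^2 = mean^2 + variance.
from math import sqrt

def rms(values):
    return sqrt(sum(v * v for v in values) / len(values))

uniform = [40] * 20               # every read has MQ 40
spread  = [20] * 10 + [60] * 10   # same mean MQ, but far more variation

print(sum(uniform) / len(uniform), round(rms(uniform), 1))   # 40.0 40.0
print(sum(spread) / len(spread),  round(rms(spread), 1))     # 40.0 44.7
```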

+

Now let’s check out the graph of MQ values for the unfiltered variants. Notice the very large peak around MQ = 60. Our recommendation is to fail any variant with an MQ value less than 40.0. You may argue that hard filtering any variant with an MQ value less than 50 is fine as well. This brings up an excellent point that our hard filtering recommendations are meant to be very lenient. We prefer to keep all potentially decent variants rather than get rid of a few bad variants.

+ +

Let’s look at the VQSR pass vs fail variants. At first glance, it seems like VQSR has passed the variants in the high peak and failed any variants not in the peak.

+ +

It is hard to tell which variants passed and failed, so let’s zoom in and see what exactly is happening.

+ +

The plot above shows the x-axis from 59-61. Notice the variants in blue (the ones that passed) all have MQ around 60. However, some variants in red (the ones that failed) also have an MQ around 60.

+
+

MappingQualityRankSumTest (MQRankSum)

+

This is the u-based z-approximation from the Rank Sum Test for mapping qualities. It compares the mapping qualities of the reads supporting the reference allele and the alternate allele. A positive value means the mapping qualities of the reads supporting the alternate allele are higher than those supporting the reference allele; a negative value indicates the mapping qualities of the reference allele are higher than those supporting the alternate allele. A value close to zero is best and indicates little difference between the mapping qualities.

+

Next, let’s look at the distribution of values for MQRankSum in the unfiltered variants. Notice the values range from approximately -10.5 to 6.5. Our hard filter threshold is -12.5. There are no variants in this dataset that have MQRankSum less than -10.5! In this case, hard filtering would not fail any variants based on MQRankSum. Remember, our hard filtering recommendations are meant to be very lenient. If you do plot your annotation values for your samples and find none of your variants have MQRankSum less than -12.5, you may want to refine your hard filters. Our recommendations are indeed recommendations that you the scientist will want to refine yourself.

+ +

Looking at the plot of pass VQSR vs fail VQSR variants, we see the variants with an MQRankSum value less than -2.5 fail VQSR. However, the region between -2.5 to 2.5 contains both pass and fail variants. Are you noticing a trend here? It is very difficult to pick a threshold for hard filtering. If we pick -2.5 as our hard filtering threshold, we still have many variants that fail VQSR in our dataset. If we try to get rid of those variants, we will lose some good variants as well. It is up to you to decide how many false positives you would like to remove from your dataset vs how many true positives you would like to keep and adjust your threshold based on that.

+ +
+

ReadPosRankSumTest (ReadPosRankSum)

+

This is the u-based z-approximation from the Rank Sum Test for site position within reads. It compares whether the positions of the reference and alternate alleles are different within the reads. Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. A negative value indicates that the alternate allele is found at the ends of reads more often than the reference allele; a positive value indicates that the reference allele is found at the ends of reads more often than the alternate allele. A value close to zero is best because it indicates there is little difference between the positions of the reference and alternate alleles in the reads.

+

The last annotation we will look at is ReadPosRankSum. Notice the values fall mostly between -4 and 4. Our hard filtering threshold removes any variant with a ReadPosRankSum value less than -8.0. Again, there are no variants in this dataset that have a ReadPosRankSum value less than -8.0, but some datasets might. If you plot your variant annotations and find there are no variants that have a value less than or greater than one of our recommended cutoffs, you will have to refine them yourself based on your annotation plots.

+ +

Looking at the VQSR pass vs fail variants, we can see VQSR has failed variants with ReadPosRankSum values less than -1.0 and greater than 3.5. However, notice VQSR has failed some variants whose values would have passed the hard-filtering threshold.
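To pull the thresholds discussed in this article together, here is a small Python sketch (not a GATK command) that checks a dictionary of SNP annotation values against the generic cutoffs; a missing annotation is simply skipped, in keeping with the lenient spirit of these recommendations. The annotation values in the example are invented.

```python
# Minimal sketch: the generic SNP hard-filtering thresholds discussed above,
# applied to a dict of annotation values taken from a VCF INFO field.
SNP_HARD_FILTERS = {
    "QD":             lambda v: v < 2.0,
    "FS":             lambda v: v > 60.0,
    "SOR":            lambda v: v > 3.0,
    "MQ":             lambda v: v < 40.0,
    "MQRankSum":      lambda v: v < -12.5,
    "ReadPosRankSum": lambda v: v < -8.0,
}

def failed_filters(annotations):
    """Return the names of the filters this site fails; missing annotations are skipped."""
    return [name for name, fails in SNP_HARD_FILTERS.items()
            if name in annotations and fails(annotations[name])]

site = {"QD": 1.4, "FS": 72.3, "SOR": 1.2, "MQ": 59.8, "ReadPosRankSum": -0.8}
print(failed_filters(site))    # ['QD', 'FS'] -- this site would be hard-filtered
```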

+ \ No newline at end of file diff --git a/doc_archive/methods/Using_JEXL_to_apply_hard_filters_or_select_variants_based_on_annotation_values.md b/doc_archive/methods/Using_JEXL_to_apply_hard_filters_or_select_variants_based_on_annotation_values.md new file mode 100644 index 000000000..1277c20e7 --- /dev/null +++ b/doc_archive/methods/Using_JEXL_to_apply_hard_filters_or_select_variants_based_on_annotation_values.md @@ -0,0 +1,73 @@ +## Using JEXL to apply hard filters or select variants based on annotation values + +http://gatkforums.broadinstitute.org/gatk/discussion/1255/using-jexl-to-apply-hard-filters-or-select-variants-based-on-annotation-values + +

1. JEXL in a nutshell

+

JEXL stands for Java EXpression Language. It's not a part of the GATK as such; it's a software library that can be used by Java-based programs like the GATK. It can be used for many things, but in the context of the GATK, it has one very specific use: making it possible to operate on subsets of variants from VCF files based on one or more annotations, using a single command. This is typically done with walkers such as VariantFiltration and SelectVariants.

+
+

2. Basic structure of JEXL expressions for use with the GATK

+

In this context, a JEXL expression is a string (in the computing sense, i.e. a series of characters) that tells the GATK which annotations to look at and what selection rules to apply.

+

JEXL expressions contain three basic components: keys and values, connected by operators. For example, in this simple JEXL expression which selects variants whose quality score is greater than 30:

+
"QUAL > 30.0"
+ +

The complete expression must be framed by double quotes. Within this, keys are strings (typically written in uppercase or CamelCase), and values can be either strings, numbers or booleans (TRUE or FALSE) -- but if they are strings the values must be framed by single quotes, as in the following example:

+
"MY_STRING_KEY == 'foo'"
+
+

3. Evaluation on multiple annotations

+

You can build expressions that calculate a metric based on two separate annotations, for example if you want to select variants for which quality (QUAL) divided by depth of coverage (DP) is below a certain threshold value:

+
"QUAL / DP < 10.0"
+

You can also join multiple conditional statements with logical operators, for example if you want to select variants that have both sufficient quality (QUAL) and a certain depth of coverage (DP):

+
"QUAL > 30.0 && DP == 10"
+

where && is the logical "AND".

+

Or if you want to select variants that have at least one of several conditions fulfilled:

+
"QD < 2.0 || ReadPosRankSum < -20.0 || FS > 200.0"
+

where || is the logical "OR".

+
+

4. Filtering on sample/genotype-level properties

+

You can also filter individual samples/genotypes in a VCF based on information from the FORMAT field. VariantFiltration will add the sample-level FT tag to the FORMAT field of filtered samples. Note however that this does not affect the record's FILTER tag. This is still a work in progress and isn't quite as flexible and powerful yet as we'd like it to be. For now, you can filter based on most fields as normal (e.g. GQ < 5.0), but the GT (genotype) field is an exception. We have put in convenience methods to enable filtering out heterozygous calls (isHet == 1), homozygous-reference calls (isHomRef == 1), and homozygous-variant calls (isHomVar == 1).

+
+

5. Important caveats

+

Sensitivity to case and type

+

You're probably used to case being important (whether letters are lowercase or UPPERCASE) but now you need to also pay attention to the type of value that is involved -- for example, numbers are differentiated between integers and floats (essentially, non-integers). These points are especially important to keep in mind:

+ +

Currently, VCF INFO field keys are case-sensitive. That means that if you have a QUAL field in uppercase in your VCF record, the system will not recognize it if you write it differently (Qual, qual or whatever) in your JEXL expression.

+ +

The types (i.e. string, integer, non-integer or boolean) used in your expression must be exactly the same as that of the value you are trying to evaluate. In other words, if you have a QUAL field with non-integer values (e.g. 45.3) and your filter expression is written as an integer (e.g. "QUAL < 50"), the system will throw a hissy fit (aka a Java exception).

+

Complex queries

+

We highly recommend that complex expressions involving multiple AND/OR operations be split up into separate expressions whenever possible to avoid confusion. If you are using complex expressions, make sure to test them on a panel of different sites with several combinations of yes/no criteria.

+
+

6. More complex JEXL magic

+

Note that this last part is fairly advanced and not for the faint of heart. To be frank, it's also explained rather more briefly than the topic deserves. But if there's enough demand for this level of usage (click the "view in forum" link and leave a comment) we'll consider producing a full-length tutorial.

+

Introducing the VariantContext object

+

When you use SelectVariants with JEXL, what happens under the hood is that the program accesses something called the VariantContext, which is a representation of the variant call with all its annotation information. The VariantContext is technically not part of GATK; it's part of the variant library included within the Picard tools source code, which GATK uses for convenience.

+

The reason we're telling you about this is that you can actually make more complex queries than what the GATK offers convenience functions for, provided you're willing to do a little digging into the VariantContext methods. This will allow you to leverage the full range of capabilities of the underlying objects from the command line.

+

In a nutshell, the VariantContext is available through the vc variable, and you just need to add method calls to that variable in your command line. The best way to find out what methods are available is to read the VariantContext documentation on the Picard tools source code repository (on SourceForge), but we list a few examples below to whet your appetite.

+

Using VariantContext directly

+

For example, suppose I want to use SelectVariants to select all of the sites where sample NA12878 is homozygous-reference. This can be accomplished by accessing the underlying VariantContext as follows:

+
java -jar GenomeAnalysisTK.jar -T SelectVariants -R b37/human_g1k_v37.fasta --variant my.vcf -select 'vc.getGenotype("NA12878").isHomRef()'
+

Groovy, right? Now here's a more sophisticated example of a JEXL expression that finds all novel variants in the total set with an allele frequency > 0.25 but not 1, that are not filtered, and that are non-reference in the 01-0263 sample:

+
! vc.getGenotype("01-0263").isHomRef() && (vc.getID() == null || vc.getID().equals(".")) && AF > 0.25 && AF < 1.0 && vc.isNotFiltered() && vc.isSNP() -o 01-0263.high_freq_novels.vcf -sn 01-0263
+

Using the VariantContext to evaluate boolean values

+

The classic way of evaluating a boolean goes like this:

+
java -Xmx4g -jar GenomeAnalysisTK.jar -T SelectVariants -R b37/human_g1k_v37.fasta --variant my.vcf -select 'DB'
+

But you can also use the VariantContext object like this:

+
java -Xmx4g -jar GenomeAnalysisTK.jar -T SelectVariants -R b37/human_g1k_v37.fasta --variant my.vcf -select 'vc.hasAttribute("DB")'
+

Using VariantContext to access annotations in multiallelic sites

+

The order of alleles in the VariantContext object is not guaranteed to be the same as in the VCF output, so accessing the AF by an index derived from a scrambled alleles array is dangerous. However! If we have the sample genotypes, there's a workaround:

+
java -jar GenomeAnalysisTK.jar -T SelectVariants -R reference.fasta -V multiallelics.vcf -select 'vc.hasGenotypes() && vc.getCalledChrCount(vc.getAltAlleleWithHighestAlleleCount())/(1.0*vc.getCalledChrCount()) > 0.1' -o multiHighAC.vcf
+

The odd 1.0 is there because otherwise we're dividing two integers, which will always yield 0. The vc.hasGenotypes() is extra error checking. This might be slow for large files, but we could use something like this if performance is a concern:

+
java -jar GenomeAnalysisTK.jar -T SelectVariants -R reference.fasta -V multiallelics.vcf -select 'vc.isBiallelic() ? AF > 0.1 : vc.hasGenotypes() && vc.getCalledChrCount(vc.getAltAlleleWithHighestAlleleCount())/(1.0*vc.getCalledChrCount()) > 0.1' -o multiHighAC.vcf
+

Where hopefully the ternary expression shortcuts the extra vc calls for all the biallelics.

+

Using JEXL to evaluate arrays

+

Sometimes you might want to write a JEXL expression to evaluate e.g. the AD (allelic depth) field in the FORMAT column. However, the AD is technically not an integer; rather it is a list (array) of integers. One can evaluate the array data using the "." operator. Here's an example:

+
java -jar GenomeAnalysisTK.jar -T SelectVariants -R b37/human_g1k_v37.fasta --variant my.vcf -select 'vc.getGenotype("NA12878").getAD().0 > 10'
+

If you would like to select sites where the alternate allele frequency is greater than 50%, you can use the following expression:

+
java -jar GenomeAnalysisTK.jar -T SelectVariants -R b37/human_g1k_v37.fasta --variant my.vcf -select 'vc.getGenotype("NA12878").getAD().1 / vc.getGenotype("NA12878").getDP() > 0.50'
\ No newline at end of file diff --git a/doc_archive/methods/Using_depth_of_coverage_metrics_for_variant_evaluation.md b/doc_archive/methods/Using_depth_of_coverage_metrics_for_variant_evaluation.md new file mode 100644 index 000000000..68a76b5e0 --- /dev/null +++ b/doc_archive/methods/Using_depth_of_coverage_metrics_for_variant_evaluation.md @@ -0,0 +1,24 @@ +## Using depth of coverage metrics for variant evaluation + +http://gatkforums.broadinstitute.org/gatk/discussion/4721/using-depth-of-coverage-metrics-for-variant-evaluation + +

Overview

+

This document describes the proper use of metrics associated with depth of coverage for the purpose of evaluating variants.

+

The metrics involved are the following:

- the allele depth per sample (AD)
- the overall depth of coverage (DP)

For an overview of the tools and concepts involved in performing sequence coverage analysis, where the purpose is to answer the common question: "(Where) Do I have enough sequence data to be empowered to discover variants with reasonable confidence?", please see this document.

+
+

Coverage annotations: DP and AD

+

The variant callers generate two main coverage annotation metrics: the allele depth per sample (AD) and overall depth of coverage (DP, available both per sample and across all samples, with important differences), controlled by the following annotator modules:

- DepthPerAlleleBySample (AD)
- Coverage (DP)

At the sample level, these annotations are highly complementary metrics that provide two important ways of thinking about the depth of the data available for a given sample at a given site. The key difference is that the AD metric is based on unfiltered read counts while the sample-level DP is based on filtered read counts (see tool documentation for a list of read filters that are applied by default for each tool). As a result, they should be interpreted differently.

+

The sample-level DP is in some sense reflective of the power I have to determine the genotype of the sample at this site, while the AD tells me how many times I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering the reads. If, for example, I believe there really is an A/T polymorphism at a site, then I would like to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would normally be excluded from the statistical calculations going into GQ and QUAL.

+

Note that because the AD includes reads and bases that were filtered by the caller (and in case of indels, is based on a statistical computation), it should not be used to make assumptions about the genotype that it is associated with. Ultimately, the phred-scaled genotype likelihoods (PLs) are what determines the genotype calls.
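As a quick illustration of the difference, here is a tiny Python sketch using a made-up FORMAT/sample pair (not real data): the AD counts can add up to more than the DP, because DP only counts the reads that survived the caller's filters.

```python
# Minimal sketch: unfiltered allele depths (AD) vs. filtered depth (DP)
# for one sample at one site, using an invented genotype record.
format_keys  = "GT:AD:DP:GQ:PL".split(":")
sample_field = "0/1:17,21:35:99:431,0,410".split(":")
call = dict(zip(format_keys, sample_field))

ad = [int(x) for x in call["AD"].split(",")]   # unfiltered counts per allele (REF first)
dp = int(call["DP"])                           # depth after the caller's read filters

print(ad, sum(ad), dp)   # [17, 21] 38 35 -> 3 reads counted in AD were filtered out of DP
```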

+
+

TO BE CONTINUED...

\ No newline at end of file diff --git a/doc_archive/methods/VariantEval_Evaluation_Modules_Glossary.md b/doc_archive/methods/VariantEval_Evaluation_Modules_Glossary.md new file mode 100644 index 000000000..5de31a649 --- /dev/null +++ b/doc_archive/methods/VariantEval_Evaluation_Modules_Glossary.md @@ -0,0 +1,158 @@ +## VariantEval Evaluation Modules Glossary + +http://gatkforums.broadinstitute.org/gatk/discussion/6309/varianteval-evaluation-modules-glossary + +

Table of Contents

+

Default modules:

- CompOverlap
- CountVariants
- IndelSummary
- TiTvVariantEvaluator
- MultiallelicSummary
+

General

+

Each table has a few columns of data that will be the same across multiple evaluation modules. To avoid listing them multiple times, they will be specified here

+

Example Output *

+ + +

*Output from a rare variant association study with >1500 whole genome sequenced samples

+
+

+

CompOverlap

+

Example Output *

+ + +

*Output from a rare variant association study with >1500 whole genome sequenced samples

+
+

+

CountVariants

+

Example Output *

+ + +

*Output from a rare variant association study with >1500 whole genome sequenced samples

+
+

+

IndelSummary

+

Example Output *

+ + +

*Output from a rare variant association study with >1500 whole genome sequenced samples

+
+

+

TiTvVariantEvaluator

+

Example Output *

+ + +

*Output from a rare variant association study with >1500 whole genome sequenced samples

+
+

+

MultiallelicSummary

+

Example Output *

+ + +

*Output from a rare variant association study with >1500 whole genome sequenced samples

\ No newline at end of file diff --git a/doc_archive/methods/Variant_Quality_Score_Recalibration_(VQSR).md b/doc_archive/methods/Variant_Quality_Score_Recalibration_(VQSR).md new file mode 100644 index 000000000..01851ab6d --- /dev/null +++ b/doc_archive/methods/Variant_Quality_Score_Recalibration_(VQSR).md @@ -0,0 +1,68 @@ +## Variant Quality Score Recalibration (VQSR) + +http://gatkforums.broadinstitute.org/gatk/discussion/39/variant-quality-score-recalibration-vqsr + +

This document describes what Variant Quality Score Recalibration (VQSR) is designed to do, and outlines how it works under the hood. The first section is a high-level overview aimed at non-specialists. Additional technical details are provided below.

+

For command-line examples and recommendations on what specific resource datasets and arguments to use for VQSR, please see this FAQ article. See the VariantRecalibrator tool doc and the ApplyRecalibration tool doc for a complete description of available command line arguments.

+

As a complement to this document, we encourage you to watch the workshop videos available in the Presentations section.

+
+

High-level overview

+

VQSR stands for “variant quality score recalibration”, which is a bad name because it’s not re-calibrating variant quality scores at all; it is calculating a new quality score that is supposedly super well calibrated (unlike the variant QUAL score which is a hot mess) called the VQSLOD (for variant quality score log-odds). I know this probably sounds like gibberish, stay with me. The purpose of this new score is to enable variant filtering in a way that allows analysts to balance sensitivity (trying to discover all the real variants) and specificity (trying to limit the false positives that creep in when filters get too lenient) as finely as possible.

+

The basic, traditional way of filtering variants is to look at various annotations (context statistics) that describe e.g. what the sequence context is like around the variant site, how many reads covered it, how many reads covered each allele, what proportion of reads were in forward vs reverse orientation; things like that -- then choose threshold values and throw out any variants that have annotation values above or below the set thresholds. The problem with this approach is that it is very limiting because it forces you to look at each annotation dimension individually, and you end up throwing out good variants just because one of their annotations looks bad, or keeping bad variants in order to keep those good variants.

+

The VQSR method, in a nutshell, uses machine learning algorithms to learn from each dataset what is the annotation profile of good variants vs. bad variants, and does so in a way that integrates information from multiple dimensions (like, 5 to 8, typically). The cool thing is that this allows us to pick out clusters of variants in a way that frees us from the traditional binary choice of “is this variant above or below the threshold for this annotation?”

+

Let’s do a quick mental visualization exercise (pending an actual figure to illustrate this), in two dimensions because our puny human brains work best at that level. Imagine a topographical map of a mountain range, with North-South and East-West axes standing in for two variant annotation scales. Your job is to define a subset of territory that contains mostly mountain peaks, and as few lowlands as possible. Traditional hard-filtering forces you to set a single longitude cutoff and a single latitude cutoff, resulting in one rectangular quadrant of the map being selected, and all the rest being greyed out. It’s about as subtle as a sledgehammer and forces you to make a lot of compromises. VQSR allows you to select contour lines around the peaks and decide how low or how high you want to go to include or exclude territory within your subset.

+

How this is achieved is another can of worms. The key point is that we use known, highly validated variant resources (Omni, 1000 Genomes, HapMap) to select a subset of variants within our callset that we’re really confident are probably true positives (that’s the training set). We look at the annotation profiles of those variants (in our own data!), and from that we learn some rules about how to recognize good variants. We do something similar for bad variants as well. Then we apply the rules we learned to all of the sites, which (through some magical hand-waving) yields a single score for each variant that describes how likely it is based on all the examined dimensions. In our map analogy this is the equivalent of determining on which contour line the variant sits. Finally, we pick a threshold value indirectly by asking the question “what score do I need to choose so that e.g. 99% of the variants in my callset that are also in HapMap will be selected?”. This is called the target sensitivity. We can twist that dial in either direction depending on what is more important for our project, sensitivity or specificity.

+
+

+

Technical overview

+

The purpose of variant recalibration is to assign a well-calibrated probability to each variant call in a call set. This enables you to generate highly accurate call sets by filtering based on this single estimate for the accuracy of each call.

+

The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship between SNP call annotations (QD, SB, HaplotypeScore, HRun, for example) and the probability that a SNP is a true genetic variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided as input (typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array, for humans). This adaptive error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model.

+

The variant recalibrator contrastively evaluates variants in a two step process, each performed by a distinct tool:

- VariantRecalibrator, which builds the adaptive error model and assigns a VQSLOD score to each variant
- ApplyRecalibration, which uses that score to filter the callset at the chosen target sensitivity

Please see the VQSR tutorial for step-by-step instructions on running these tools.

+
+

How VariantRecalibrator works in a nutshell

+

The tool takes the overlap of the training/truth resource sets and of your callset. It models the distribution of these variants relative to the annotations you specified, and attempts to group them into clusters. Then it uses the clustering to assign VQSLOD scores to all variants. Variants that are closer to the heart of a cluster will get a higher score than variants that are outliers.

+
+

How ApplyRecalibration works in a nutshell

+

During the first part of the recalibration process, variants in your callset were given a score called VQSLOD. At the same time, variants in your training sets were also ranked by VQSLOD. When you specify a tranche sensitivity threshold with ApplyRecalibration, expressed as a percentage (e.g. 99.9%), the program determines the VQSLOD value above which 99.9% of the variants in the training callset are included. It then takes that value of VQSLOD and uses it as a threshold to filter your variants. Variants that are above the threshold pass the filter, so the FILTER field will contain PASS. Variants that are below the threshold will be filtered out; they will be written to the output file, but in the FILTER field they will have the name of the tranche they belonged to. So VQSRTrancheSNP99.90to100.00 means that the variant was in the range of VQSLODs corresponding to the remaining 0.1% of the training set, which are basically considered false positives.
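If you want to see how this played out in your own file, a quick and purely illustrative way (the file name is a placeholder) is to tally the FILTER values of the recalibrated VCF:

# Count how many records passed vs. fell into each tranche (FILTER is column 7 of a VCF)
grep -v "^#" recalibrated_variants.vcf | cut -f 7 | sort | uniq -c
# Expect to see PASS plus tranche labels like VQSRTrancheSNP99.90to100.00 for filtered records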

+
+

Interpretation of the Gaussian mixture model plots

+

The variant recalibration step fits a Gaussian mixture model to the contextual annotations given to each variant. By fitting this probability model to the training variants (variants considered to be true-positives), a probability can be assigned to the putative novel variants (some of which will be true-positives, some of which will be false-positives). It is useful for users to see how the probability model was fit to their data. Therefore a modeling report is automatically generated each time VariantRecalibrator is run (in the above command line the report will appear as path/to/output.plots.R.pdf). For every pair-wise combination of annotations used in modeling, a 2D projection of the Gaussian mixture model is shown.

+ +

The figure shows one page of an example Gaussian mixture model report that is automatically generated by the VQSR from the example HiSeq call set. This page shows the 2D projection of mapping quality rank sum test versus Haplotype score by marginalizing over the other annotation dimensions in the model.

+

In each page there are four panels which show different ways of looking at the 2D projection of the model. The upper left panel shows the probability density function that was fit to the data. The 2D projection was created by marginalizing over the other annotation dimensions in the model via random sampling. Green areas show locations in the space that are indicative of being high quality while red areas show the lowest probability areas. In general putative SNPs that fall in the red regions will be filtered out of the recalibrated call set.

+

The remaining three panels give scatter plots in which each SNP is plotted in the two annotation dimensions as points in a point cloud. The scale for each dimension is in normalized units. The data for the three panels is the same but the points are colored in different ways to highlight different aspects of the data. In the upper right panel SNPs are colored black and red to show which SNPs are retained and filtered, respectively, by applying the VQSR procedure. The red SNPs didn't meet the given truth sensitivity threshold and so are filtered out of the call set. The lower left panel colors SNPs green, grey, and purple to give a sense of the distribution of the variants used to train the model. The green SNPs are those which were found in the training sets passed into the VariantRecalibrator step, while the purple SNPs are those which were found to be furthest away from the learned Gaussians and thus given the lowest probability of being true. Finally, the lower right panel colors each SNP by their known/novel status with blue being the known SNPs and red being the novel SNPs. Here the idea is to see if the annotation dimensions provide a clear separation between the known SNPs (most of which are true) and the novel SNPs (most of which are false).

+

An example of good clustering for SNP calls from the tutorial dataset is shown to the right. The plot shows that the training data forms a distinct cluster at low values for each of the two statistics shown (haplotype score and mapping quality bias). As the SNPs fall off the distribution in either one or both of the dimensions they are assigned a lower probability (that is, move into the red region of the model's PDF) and are filtered out. This makes sense as not only do higher values of HaplotypeScore indicate a lower chance of the data being explained by only two haplotypes but also higher values for mapping quality bias indicate more evidence of bias between the reference bases and the alternative bases. The model has captured our intuition that this area of the distribution is highly enriched for machine artifacts and putative variants here should be filtered out!

+
+

Tranches and the tranche plot

+

The recalibrated variant quality score provides a continuous estimate of the probability that each variant is true, allowing one to partition the call sets into quality tranches. The main purpose of the tranches is to establish thresholds within your data that correspond to certain levels of sensitivity relative to the truth sets. The idea is that with well calibrated variant quality scores, you can generate call sets in which each variant doesn't have to have a hard answer as to whether it is in or out of the set. If a very high accuracy call set is desired then one can use the highest tranche, but if a larger, more complete call set is a higher priority then one can dip down into lower and lower tranches. These tranches are applied to the output VCF file using the FILTER field. In this way you can choose to use some of the filtered records or only use the PASSing records.

+

The first tranche (90), which has the lowest value of truth sensitivity but the highest value of novel Ti/Tv, is exceedingly specific but less sensitive. Each subsequent tranche introduces additional true positive calls along with a growing number of false positive calls. Downstream applications can select more specific or more sensitive call sets in a principled way, or incorporate the recalibrated quality scores directly, weighting individual variant calls by their probability of being real instead of analyzing only a fixed subset of calls. An example tranche plot, automatically generated by the VariantRecalibrator walker, is shown below.

+ +

This is an example of a tranches plot generated for a HiSeq call set. The x-axis gives the number of novel variants called while the y-axis shows two quality metrics -- novel transition to transversion ratio and the overall truth sensitivity.

+

Note that the tranches plot is not applicable for indels and will not be generated when the tool is run in INDEL mode.

+
+

Ti/Tv-free recalibration

+

We use a Ti/Tv-free approach to variant quality score recalibration. This approach requires an additional truth data set, and cuts the VQSLOD at given sensitivities to the truth set. It has several advantages over the Ti/Tv-targeted approach:

+ +

We have used HapMap 3.3 sites as the truth set (genotypes_r27_nr.b37_fwd.vcf), but other sets of high-quality sites (~99% truly variable in the population) should work just as well. In our experience with HapMap, 99% is a good threshold, as the remaining 1% of sites often exhibit unusual features, like being close to indels or actually being MNPs, and so receive a low VQSLOD score.
Note that the expected Ti/Tv is still an available argument, but it is only used for display purposes.

+
+

Finally, a couple of Frequently Asked Questions

+

- Can I use the variant quality score recalibrator with my small sequencing experiment?

+

This tool is expecting thousands of variant sites in order to achieve decent modeling with the Gaussian mixture model. Whole exome call sets work well, but anything smaller than that scale might run into difficulties.

+

One piece of advice is to turn down the number of Gaussians used during training. This can be accomplished by adding --maxGaussians 4 to your command line.

+

maxGaussians is the maximum number of different "clusters" (=Gaussians) of variants the program is "allowed" to try to identify. Lowering this number forces the program to group variants into a smaller number of clusters, which means there will be more variants in each cluster -- hopefully enough to satisfy the statistical requirements. Of course, this decreases the level of discrimination that you can achieve between variant profiles/error modes. It's all about trade-offs; and unfortunately if you don't have a lot of variants you can't afford to be very demanding in terms of resolution.
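For example (file names, resources and annotations are placeholders, as in the sketch above), the flag is simply appended to the VariantRecalibrator command:

java -jar GenomeAnalysisTK.jar \
    -T VariantRecalibrator \
    -R reference.fasta \
    -input raw_variants.vcf \
    -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap.vcf \
    -an QD -an MQ -an FS \
    -mode SNP \
    -recalFile small_cohort.recal \
    -tranchesFile small_cohort.tranches \
    --maxGaussians 4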

+

- Why don't all the plots get generated for me?

+

The most common problem related to this is not having Rscript accessible in your environment path. Rscript is the command-line version of R that gets installed right alongside the main R program. We also make use of the ggplot2 library, so please be sure to install that package as well. See the Common Problems section of the Guide for more details.
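A quick sanity check you can run from the shell (purely illustrative) is:

# Is Rscript on the PATH, and can it load ggplot2?
which Rscript
Rscript -e 'library(ggplot2); sessionInfo()'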

\ No newline at end of file diff --git a/doc_archive/problems/Allele_Depth_(AD)_is_lower_than_expected.md b/doc_archive/problems/Allele_Depth_(AD)_is_lower_than_expected.md new file mode 100644 index 000000000..48c85ae9b --- /dev/null +++ b/doc_archive/problems/Allele_Depth_(AD)_is_lower_than_expected.md @@ -0,0 +1,66 @@ +## Allele Depth (AD) is lower than expected + +http://gatkforums.broadinstitute.org/gatk/discussion/6005/allele-depth-ad-is-lower-than-expected + +

The problem:

+

You're trying to evaluate the support for a particular call, but the numbers in the DP (total depth) and AD (allele depth) fields aren't making any sense. For example, the sum of all the ADs doesn't match up to the DP, or even more baffling, the AD for an allele that was called is zero!

+

Many users have reported being confused by variant calls where there is apparently no evidence for the called allele. For example, sometimes a VCF may contain a variant call that looks like this:

+
2 151214 . G A 673.77 . AN=2;DP=10;FS=0.000;MLEAF=0.500;MQ=56.57;MQ0=0;NCC=0;SOR=0.693 GT:AD:DP:GQ:PL 0/1:0,0:10:38:702,0,38
+

You can see in the FORMAT field that the AD values are 0 for both of the alleles. However, the DP is 10 in both the INFO and FORMAT fields. Because the DP in the INFO field is unfiltered while the DP in the FORMAT field is filtered, and the two values are equal, you know none of the reads were filtered out by the engine's built-in read filters. And if you look at the "bamout", you see 10 reads covering the position! So why is the VCF reporting an AD value of 0?

+
+

The explanation: uninformative reads

+

This is not actually a bug -- the program is doing what we expect; this is an interpretation problem. The answer lies in uninformative reads.

+

We call a read “uninformative” when it passes the quality filters, but the likelihood of the most likely allele given the read is not significantly larger than the likelihood of the second most likely allele given the read. Specifically, the difference between the Phred-scaled likelihoods must be greater than 0.2 to be considered significant. In other words, the most likely allele must be roughly 60% more likely than the second most likely allele.

+

Let’s walk through an example to make this clearer. Let’s say we have 2 reads and 2 possible alleles at a site. All of the reads have passed HaplotypeCaller’s quality filters, and the likelihoods of the alleles given the reads are in the table below.

| Reads | Likelihood of A | Likelihood of T |
|-------|-----------------|-----------------|
| 1     | 3.8708e-7       | 3.6711e-7       |
| 2     | 4.9992e-7       | 2.8425e-7       |
+

Note: Keep in mind that HaplotypeCaller marginalizes the likelihoods of the haplotypes given the reads to get the likelihoods of the alleles given the reads. The table above shows the likelihoods of the alleles given the reads. For additional details, please see the HaplotypeCaller method documentation.

+

Now, let’s convert the likelihoods into Phred-scaled likelihoods. To do this, we simply take the log (base 10) of the likelihoods.

| Reads | Phred-scaled likelihood of A | Phred-scaled likelihood of T |
|-------|------------------------------|------------------------------|
| 1     | -6.4122                      | -6.4352                      |
| 2     | -6.3011                      | -6.5463                      |
+

Now, we want to determine if read 1 is informative. To do this, we simply look at the Phred-scaled likelihoods of the most likely allele and the second most likely allele. The Phred-scaled likelihood of the most likely allele (A) is -6.4122. The Phred-scaled likelihood of the second most likely allele (T) is -6.4352. Taking the difference between the two likelihoods gives us 0.023. Because 0.023 is less than 0.2, read 1 is considered uninformative.

+

To determine if read 2 is informative, we take -6.3011-(-6.5463). This gives us 0.2452, which is greater than 0.2. Read 2 is considered informative.

+

How does a difference of 0.2 mean the most likely allele is ~60% more likely than the second most likely allele? Because these are log-scaled (base 10) values, a difference of 0.2 corresponds to a likelihood ratio of 10^0.2 ≈ 1.585, which is approximately 60% greater.
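If you want to reproduce these numbers yourself, here is a throwaway one-liner (plain awk, just for illustration) that computes the log10 difference for read 1 from the raw likelihoods in the first table:

awk 'BEGIN {
    d = log(3.8708e-7)/log(10) - log(3.6711e-7)/log(10);    # log10 likelihood difference for read 1
    print d, (d > 0.2 ? "informative" : "uninformative");   # prints ~0.023 uninformative
}'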

+
+

Conclusion

+

So, now that we know the math behind determining which reads are informative, let’s look at how this affects the record output to the VCF. If a read is considered informative, it gets counted toward the AD and DP of the variant allele in the output record. If a read is considered uninformative, it is counted towards the DP, but not the AD. That way, the AD value reflects how many reads actually contributed support for a given allele at the site. We would not want to include uninformative reads in the AD value because we don’t have confidence in them.

+

Please note, however, that although an uninformative read is not reported in the AD, it is still used in calculations for genotyping. In future we may add an annotation to indicate counts of reads that were considered informative vs. uninformative. Let us know in the comments if you think that would be helpful.

+

In most cases, you will have enough coverage at a site to disregard small numbers of uninformative reads. Unfortunately, sometimes uninformative reads are the only reads you have at a site. In this case, we report the potential variant allele, but keep the AD values 0. The uncertainty at the site will be reflected in the GQ and PL values.

\ No newline at end of file diff --git "a/doc_archive/problems/AnalyzeCovariates_fails_with_error_message_\"RScript_exited_with_1\".md" "b/doc_archive/problems/AnalyzeCovariates_fails_with_error_message_\"RScript_exited_with_1\".md" new file mode 100644 index 000000000..edae18bf4 --- /dev/null +++ "b/doc_archive/problems/AnalyzeCovariates_fails_with_error_message_\"RScript_exited_with_1\".md" @@ -0,0 +1,36 @@ +## AnalyzeCovariates fails with error message "RScript exited with 1" + +http://gatkforums.broadinstitute.org/gatk/discussion/4294/analyzecovariates-fails-with-error-message-rscript-exited-with-1 + +

When you run AnalyzeCovariates to analyze your BQSR outputs, you may encounter an error starting with this line:

+
org.broadinstitute.sting.utils.R.RScriptExecutorException: RScript exited with 1. Run with -l DEBUG for more info.
+

The main reason why this error often occurs is simple, and so is the solution. The script depends on some external R libraries, so if you don't have them installed, the script fails. To find out what libraries are necessary and how to install them, you can refer to this tutorial.
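For reference, the plotting script has historically relied on a handful of CRAN packages; a sketch of how you might install them from the shell is shown below (the exact set can vary between GATK versions, so treat this as a starting point rather than an official list):

Rscript -e 'install.packages(c("ggplot2", "gplots", "reshape", "gsalib"), repos = "https://cran.r-project.org")'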

+

One other common issue is that the version of ggplot2 you have installed is very recent and is not compatible with the BQSR script. If so, download this Rscript file and use it to generate the plots manually according to the instructions below.

+

If you have already checked that you have all the necessary libraries installed, you'll need to run the script manually in order to find out what is wrong. To new users, this can seem complicated, but it only takes these 3 simple steps to do it!

+

1. Re-run AnalyzeCovariates with these additional parameters:

+ +

2. Identify the lines in the log output that say what parameters the RScript is given.

+

The snippet below shows you the components of the R script command line that AnalyzeCovariates uses.

+
INFO  18:04:55,355 AnalyzeCovariates - Generating plots file 'RTest.pdf' 
+DEBUG 18:04:55,672 RecalUtils - R command line: Rscript (resource)org/broadinstitute/gatk/utils/recalibration/BQSR.R /Users/schandra/BQSR_Testing/RTest.csv /Users/schandra/BQSR_Testing/RTest.recal /Users/schandra/BQSR_Testing/RTest.pdf 
+DEBUG 18:04:55,687 RScriptExecutor - Executing: 
+DEBUG 18:04:55,688 RScriptExecutor -   Rscript 
+DEBUG 18:04:55,688 RScriptExecutor -   -e 
+DEBUG 18:04:55,688 RScriptExecutor -   tempLibDir = '/var/folders/j9/5qgr3mvj0590pd2yb9hwc15454pxz0/T/Rlib.2085451458391709180';source('/var/folders/j9/5qgr3mvj0590pd2yb9hwc15454pxz0/T/BQSR.761775214345441497.R'); 
+DEBUG 18:04:55,689 RScriptExecutor -   /Users/schandra/BQSR_Testing/RTest.csv 
+DEBUG 18:04:55,689 RScriptExecutor -   /Users/schandra/BQSR_Testing/RTest.recal 
+DEBUG 18:04:55,689 RScriptExecutor -   /Users/schandra/BQSR_Testing/RTest.pdf 
+

So, your full command line will be:

+
Rscript BQSR.R RTest.csv RTest.recal RTest.pdf
+

Please note:

+ +

3. Run the script manually with the above arguments.

+

For new users, the easiest way to do this is to do it from within an IDE program like RStudio. Or, you can start up R at the command line and run it that way, whatever you are comfortable with.

\ No newline at end of file diff --git a/doc_archive/problems/Errors_about_contigs_in_BAM_or_VCF_files_not_being_properly_ordered_or_sorted.md b/doc_archive/problems/Errors_about_contigs_in_BAM_or_VCF_files_not_being_properly_ordered_or_sorted.md new file mode 100644 index 000000000..8d4d9fb62 --- /dev/null +++ b/doc_archive/problems/Errors_about_contigs_in_BAM_or_VCF_files_not_being_properly_ordered_or_sorted.md @@ -0,0 +1,27 @@ +## Errors about contigs in BAM or VCF files not being properly ordered or sorted + +http://gatkforums.broadinstitute.org/gatk/discussion/1328/errors-about-contigs-in-bam-or-vcf-files-not-being-properly-ordered-or-sorted + +

This is not as common as the "wrong reference build" problem, but it still pops up every now and then: a collaborator gives you a BAM or VCF file that's derived from the correct reference, but for whatever reason the contigs are not sorted in the same order. The GATK can be particular about the ordering of contigs in BAM and VCF files, so it will fail with an error in this case.

+

So what do you do?

+
+

For BAM files

+

You run Picard's ReorderSam tool on your BAM file, using the reference genome dictionary as a template, like this:

+
java -jar picard.jar ReorderSam \
+    I=original.bam \
+    O=reordered.bam \
+    R=reference.fasta \
+    CREATE_INDEX=TRUE
+

Where reference.fasta is your genome reference, which must be accompanied by a valid *.dict dictionary file. The CREATE_INDEX argument is optional but useful if you plan to use the resulting file directly with GATK (otherwise you'll need to run another tool to create an index).
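If you don't already have a dictionary for your reference, you can generate one with Picard (file names are illustrative):

java -jar picard.jar CreateSequenceDictionary \
    R=reference.fasta \
    O=reference.dict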

+

Be aware that this tool will drop reads that don't have equivalent contigs in the new reference (which may or may not be a problem, depending on what you want). If contigs have the same name in the BAM and the new reference, this tool assumes that the alignment of the read in the new BAM is the same. This is not a liftover tool!

+
+

For VCF files

+

You run Picard's SortVcf tool on your VCF file, using the reference genome dictionary as a template, like this:

+
java -jar picard.jar SortVcf \
+    I=original.vcf \
+    O=sorted.vcf \
+    SEQUENCE_DICTIONARY=reference.dict 
+

Where reference.dict is the sequence dictionary of your genome reference.

+

Note that you may need to delete the index file that gets created automatically for your new VCF by the Picard tool. GATK will automatically regenerate an index file for your VCF.

+

Version-specific alert for GATK 3.5

+

In version 3.5, we added some beefed-up VCF sequence dictionary validation. Unfortunately, as a side effect of the additional checks, some users have experienced an error that starts with "ERROR MESSAGE: Lexicographically sorted human genome sequence detected in variant." that is due to unintentional activation of a check that is not necessary. This will be fixed in the next release; in the meantime -U ALLOW_SEQ_DICT_INCOMPATIBILITY can be used (with caution) to override the check.
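For example, on a GATK 3.5 command that trips this check, the override (use with caution, as noted above) is added like this; the tool and file names here are just placeholders:

java -jar GenomeAnalysisTK.jar \
    -T HaplotypeCaller \
    -R reference.fasta \
    -I input.bam \
    -U ALLOW_SEQ_DICT_INCOMPATIBILITY \
    -o output.vcf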

\ No newline at end of file diff --git a/doc_archive/problems/Errors_about_input_files_having_missing_or_incompatible_contigs.md b/doc_archive/problems/Errors_about_input_files_having_missing_or_incompatible_contigs.md new file mode 100644 index 000000000..780d3c2ba --- /dev/null +++ b/doc_archive/problems/Errors_about_input_files_having_missing_or_incompatible_contigs.md @@ -0,0 +1,65 @@ +## Errors about input files having missing or incompatible contigs + +http://gatkforums.broadinstitute.org/gatk/discussion/63/errors-about-input-files-having-missing-or-incompatible-contigs + +

These errors occur when the names or sizes of contigs don't match between input files. This is a classic problem that typically happens when you get some files from collaborators, you try to use them with your own data, and GATK fails with a big fat error saying that the contigs don't match.

+

The first thing you need to do is find out which files are mismatched, because that will affect how you can fix the problem. This information is included in the error message, as shown in the examples below. You'll notice that GATK always evaluates everything relative to the reference.

+
+

BAM file contigs not matching the reference

+

A very common case we see looks like this:

+
##### ERROR MESSAGE: Input files reads and reference have incompatible contigs: Found contigs with the same name but different lengths:
+##### ERROR   contig reads = chrM / 16569
+##### ERROR   contig reference = chrM / 16571.
+##### ERROR   reads contigs = [chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22, chrX, chrY, chrM]
+##### ERROR   reference contigs = [chrM, chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22, chrX, chrY, chr1_gl000191_random, chr1_gl000192_random, chr4_ctg9_hap1, chr4_gl000193_random, chr4_gl000194_random, chr6_apd_hap1, chr6_cox_hap2, chr6_dbb_hap3, chr6_mann_hap4, chr6_mcf_hap5, chr6_qbl_hap6, chr6_ssto_hap7, chr7_gl000195_random, chr8_gl000196_random, chr8_gl000197_random, chr9_gl000198_random, chr9_gl000199_random, chr9_gl000200_random, chr9_gl000201_random, chr11_gl000202_random, chr17_ctg5_hap1, chr17_gl000203_random, chr17_gl000204_random, chr17_gl000205_random, chr17_gl000206_random, chr18_gl000207_random, chr19_gl000208_random, chr19_gl000209_random, chr21_gl000210_random, chrUn_gl000211, chrUn_gl000212, chrUn_gl000213, chrUn_gl000214, chrUn_gl000215, chrUn_gl000216, chrUn_gl000217, chrUn_gl000218, chrUn_gl000219, chrUn_gl000220, chrUn_gl000221, chrUn_gl000222, chrUn_gl000223, chrUn_gl000224, chrUn_gl000225, chrUn_gl000226, chrUn_gl000227, chrUn_gl000228, chrUn_gl000229, chrUn_gl000230, chrUn_gl000231, chrUn_gl000232, chrUn_gl000233, chrUn_gl000234, chrUn_gl000235, chrUn_gl000236, chrUn_gl000237, chrUn_gl000238, chrUn_gl000239, chrUn_gl000240, chrUn_gl000241, chrUn_gl000242, chrUn_gl000243, chrUn_gl000244, chrUn_gl000245, chrUn_gl000246, chrUn_gl000247, chrUn_gl000248, chrUn_gl000249]
+

First, the error tells us that the mismatch is between the file containing reads, i.e. our BAM file, and the reference:

+
Input files reads and reference have incompatible contigs
+

It further tells us that the contig length doesn't match for the chrM contig:

+
Found contigs with the same name but different lengths:
+##### ERROR   contig reads = chrM / 16569
+##### ERROR   contig reference = chrM / 16571.
+

This can be caused either by using the wrong genome build version entirely, or using a reference that was hacked from a build that's very close but not identical, like b37 vs hg19, as detailed a bit more below.

+

We sometimes also see cases where people are using a very different reference; this is especially the case for non-model organisms where there is not yet a widely-accepted standard genome reference build.

+

Note that the error message also lists the content of the sequence dictionaries that it found for each file, and we see that some contigs in our reference dictionary are not listed in the BAM dictionary, but that's not a problem. If it was the opposite, with extra contigs in the BAM (or VCF), then GATK wouldn't know what to do with the reads from these extra contigs and would error out (even if we try restricting analysis using -L) with something like this:

+
#### ERROR MESSAGE: BAM file(s) do not have the contig: chrM. You are probably using a different reference than the one this file was aligned with.
+

Solution

+

If you can, simply switch to the correct reference. Note that file names may be misleading, as people will sometimes rename files willy-nilly. Sometimes you'll need to do some detective work to identify the correct reference if you inherited someone else's sequence data.
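When doing that detective work, it helps to compare contig names and lengths directly; a quick, purely illustrative way to peek at both dictionaries is:

# Contigs recorded in the BAM header
samtools view -H suspect.bam | grep '^@SQ' | head

# Contigs in the reference dictionary (a .dict file is a SAM-style header)
grep '^@SQ' reference.dict | head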

+

If that's not an option because you either can't find the correct reference or you absolutely MUST use a particular reference build, then you will need to redo the alignment altogether. Sadly there is no liftover procedure for reads. If you don't have access to the original unaligned sequence files, you can use Picard tools to revert your BAM file back to an unaligned state (either unaligned BAM or FASTQ depending on the workflow you wish to follow).
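A minimal sketch of reverting an aligned BAM with Picard RevertSam is shown below (file names are placeholders; check the RevertSam documentation for options such as SANITIZE before running this on real data):

java -jar picard.jar RevertSam \
    I=aligned.bam \
    O=reverted_unmapped.bam \
    SANITIZE=true \
    REMOVE_ALIGNMENT_INFORMATION=true \
    SORT_ORDER=queryname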

+

Special case of b37 vs. hg19

+

The b37 and hg19 human genome builds are very similar, and the canonical chromosomes (1 through 22, X and Y) only differ by their names (no prefix vs. chr prefix, respectively). If you only care about those, and don't give a flying fig about the decoys or the mitochondrial genome, you could just rename the contigs throughout your mismatching file and call it done, right?

+

Well... This can work if you do it carefully and cleanly -- but many things can go wrong during the editing process that can screw up your files even more, and it only applies to the canonical chromosomes. The mitochondrial contig is a slightly different length (see error above) in addition to having a different naming convention, and all the other contigs (decoys, herpes virus etc) don't have direct equivalents.

+

So only try that if you know what you're doing. YMMV.

+
+

VCF file contigs not matching the reference

+
ERROR MESSAGE: Input files known and reference have incompatible contigs: Found contigs with the same name but different lengths:
+ERROR contig known = chrM / 16569
+ERROR contig reference = chrM / 16571.
+

Yep, it's just like the error we had with the BAM file above. Looks like we're using the wrong genome build again and a contig length doesn't match. But this time the error tells us that the mismatch is between the file identified as known and the reference:

+
Input files known and reference have incompatible contigs
+

We know (trust me) that this is the output of a RealignerTargetCreator command, so the known file must be the VCF file provided through the known argument. Depending on the tool, the way the file is identified may vary, but the logic should be fairly obvious.

+

Solution

+

If you can, find a version of the VCF file that is derived from the right reference. If you're working with human data and the VCF in question is just a common resource like dbsnp, you're in luck -- we provide versions of dbsnp and similar resources derived from the major human reference builds in our resource bundle (see FAQs for access details).

+
location: ftp.broadinstitute.org
+username: gsapubftp-anonymous
+

If that's not an option, then you'll have to "liftover" -- specifically, liftover the mismatching VCF to the reference you need to work with. The best tool for liftover is Picard's LiftoverVCF.
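A minimal sketch of a Picard liftover command is shown below; the chain file, reference and file names are placeholders you need to substitute:

java -jar picard.jar LiftoverVcf \
    I=calls.b37.vcf \
    O=calls.hg19.vcf \
    CHAIN=b37ToHg19.over.chain \
    REJECT=rejected_records.vcf \
    R=hg19_reference.fasta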

+

GATK used to include some liftover utilities (documented below for the record) but we no longer support them.

+

Liftover procedure with older versions of GATK

+

This procedure involves three steps:

+
  1. Run GATK LiftoverVariants on your VCF file
  2. Run a script to sort the lifted-over file
  3. Filter out records whose REF field does not match the new reference

We provide a script that performs those three steps for you, called liftOverVCF.pl, which is available in our public source repository -- but you have to check out a version older than 3.4 -- under the 'perl' directory. Instructions for pulling down our source code from github are available here.

+

The example below shows how you would run the script:

+
./liftOverVCF.pl \
+    -vcf calls.b36.vcf \                    # input vcf
+    -chain b36ToHg19.broad.over.chain \ # chain file
+    -out calls.hg19.vcf \                   # output vcf
+    -gatk gatk_source \                     # path to source code
+    -newRef Homo_sapiens_assembly19 \    # path to new reference base name (without extension)
+    -oldRef human_b36_both \            # path to old reference prefix (without extension)
+    -tmp /broad/shptmp [defaults to /tmp]   # temp file location (defaults to /tmp)
+

We provide several chain files to liftover between the major human reference builds, also in our resource bundle (mentioned above) in the Liftover_Chain_Files directory. If you are working with non-human organisms, we can't help you -- but others may have chain files, so ask around in your field.

+

Note that if you're at the Broad, you can access chain files to liftover from b36/hg18 to hg19 on the humgen server.

+
/humgen/gsa-hpprojects/GATK/data/Liftover_Chain_Files/
\ No newline at end of file diff --git a/doc_archive/problems/Errors_about_misencoded_quality_scores.md b/doc_archive/problems/Errors_about_misencoded_quality_scores.md new file mode 100644 index 000000000..77b587577 --- /dev/null +++ b/doc_archive/problems/Errors_about_misencoded_quality_scores.md @@ -0,0 +1,14 @@ +## Errors about misencoded quality scores + +http://gatkforums.broadinstitute.org/gatk/discussion/6470/errors-about-misencoded-quality-scores + +

The problem

+

You get an error like this:

+
SAM/BAM/CRAM file <filename> appears to be using the wrong encoding for quality scores
+

Why this happens

+

The standard format for quality score encodings is that Q0 == ASCII 33 according to the SAM specification. However, in some datasets (including older Illumina data), encoding starts at ASCII 64. This is a problem because the GATK assumes that it can use the quality scores as they are. If they are in fact encoded using a different scale, our tools will make an incorrect estimation of the quality of your data, and your analysis results will be off.

+

To prevent this from happening, the GATK engine performs a sanity check of the quality score encodings that will abort the program run if they are not standard (since version 2.3), and output the error message shown above.

+

Solution

+

If this happens to you, you'll need to run again with the flag --fix_misencoded_quality_scores / -fixMisencodedQuals. What will happen is that the engine will simply subtract 31 from every quality score as it is read in, and proceed with the corrected values. Output files will include the correct scores where applicable.
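For example, one way to apply the fix and write out a corrected BAM is a pass through GATK3 PrintReads with the flag added (file names are placeholders):

java -jar GenomeAnalysisTK.jar \
    -T PrintReads \
    -R reference.fasta \
    -I misencoded.bam \
    --fix_misencoded_quality_scores \
    -o fixed_quals.bam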

+

Related problems

+

In some cases the data contains a mix of encodings (which is likely to arise if you're passing in a lot of different files from different sources together), and the GATK can't automatically compensate for that. There is an argument you can use to override this check: -allowPotentiallyMisencodedQuals / --allow_potentially_misencoded_quality_scores; but you use it at your own risk. We strongly encourage you to check the encodings of your files rather than use this option.

\ No newline at end of file diff --git a/doc_archive/problems/Errors_about_read_group_(RG)_information.md b/doc_archive/problems/Errors_about_read_group_(RG)_information.md new file mode 100644 index 000000000..5f7c35377 --- /dev/null +++ b/doc_archive/problems/Errors_about_read_group_(RG)_information.md @@ -0,0 +1,37 @@ +## Errors about read group (RG) information + +http://gatkforums.broadinstitute.org/gatk/discussion/59/errors-about-read-group-rg-information + +

What are read groups?

+

See the Dictionary entry on read groups.

+

Errors about missing or undefined read groups

+

As detailed in the FAQs about input requirements, GATK expects all read groups appearing in the read data to be specified in the file header, and will fail with an error if it does not find that information (whether there is no read group information in the file, or a subset of reads do not have read groups).

+

Typically you should add read group information when you perform the original alignments (with e.g. BWA, which has an option to do so). So what do you do if you forgot to do that, and you don't want to have to rerun BWA all over again?

+

Solution

+

You can use a Picard tool called AddOrReplaceReadGroups to add the missing information to your input file.

+

Here's an example:

+
# throws an error
+java -jar GenomeAnalysisTK.jar \
+    -T HaplotypeCaller \
+    -R reference.fasta \
+    -I reads_without_RG.bam \
+    -o output.vcf
+
+# fix the read groups
+java -jar picard.jar AddOrReplaceReadGroups \
+    I=reads_without_RG.bam \
+    O=reads_with_RG.bam \
+    SORT_ORDER=coordinate \
+    RGID=foo \
+    RGLB=bar \
+    RGPL=illumina \
+    RGSM=Sample1 \
+    CREATE_INDEX=True
+
+# runs without error
+java -jar GenomeAnalysisTK.jar \
+    -T HaplotypeCaller \
+    -R reference.fasta \
+    -I reads_with_RG.bam \
+    -o output.vcf
+

Note that if you don't know what information to put in the read groups, you should ask whoever performed the sequencing or provided the BAM to give you the metadata you need.

\ No newline at end of file diff --git a/doc_archive/problems/Errors_in_SAM_BAM_files_can_be_diagnosed_with_ValidateSamFile.md b/doc_archive/problems/Errors_in_SAM_BAM_files_can_be_diagnosed_with_ValidateSamFile.md new file mode 100644 index 000000000..adb343831 --- /dev/null +++ b/doc_archive/problems/Errors_in_SAM_BAM_files_can_be_diagnosed_with_ValidateSamFile.md @@ -0,0 +1,166 @@ +## Errors in SAM/BAM files can be diagnosed with ValidateSamFile + +http://gatkforums.broadinstitute.org/gatk/discussion/7571/errors-in-sam-bam-files-can-be-diagnosed-with-validatesamfile + +

The problem

+

You're trying to run a GATK or Picard tool that operates on a SAM or BAM file, and getting some cryptic error that doesn't clearly tell you what's wrong. Bits of the stack trace (the pile of lines in the output log that the program outputs when there is a problem) may contain the following: java.lang.String, Error Type Count, NullPointerException -- or maybe something else that doesn't mean anything to you.

+

Why this happens

+

The most frequent cause of these unexplained problems is not a bug in the program -- it's an invalid or malformed SAM/BAM file. This means that there is something wrong either with the content of the file (something important is missing) or with its format (something is written the wrong way). Invalid SAM/BAM files generally have one or more errors in the following sections: the header tags, the alignment fields, or the optional alignment tags. In addition, the SAM/BAM index file can be a source of errors as well.

+

The source of these errors is usually introduced by upstream processing tools, such as the genome mapper/aligner or any other data processing tools you may have applied before feeding the data to Picard or GATK.

+

The solution

+

To fix these problems, you first have to know what's wrong. Fortunately there's a handy Picard tool that can test for (almost) all possible SAM/BAM format errors, called ValidateSamFile.

+

We recommend the workflow included below for diagnosing problems with ValidateSamFile. This workflow will help you tackle the problem efficiently and set priorities for dealing with multiple errors (which often happens). We also outline typical solutions for common errors, but note that this is not meant to be an exhaustive list -- there are too many possible problems to tackle all of them in this document. To be clear, here we focus on diagnostics, not treatment.

+

In some cases, it may not be possible to fix some problems that are too severe, and you may need to redo the genome alignment/mapping from scratch! Consider running ValidateSamFile proactively at all key steps of your analysis pipeline to catch errors early!

+
+

Workflow for diagnosing SAM/BAM file errors with ValidateSamFile

+
+
+ +
+
+

1. Generate summary of errors

+

First, run ValidateSamFile in SUMMARY mode in order to get a summary of everything that is missing or improperly formatted in your input file. We set MODE=SUMMARY explicitly because by default the tool would just emit details about the first 100 problems it finds, then quit. If you have some minor formatting issues that don't really matter but affect every read record, you won't get to see more important problems that occur later in the file.

+
$ java -jar picard.jar ValidateSamFile \ 
+        I=input.bam \ 
+        MODE=SUMMARY 
+

If this outputs No errors found, then your SAM/BAM file is completely valid. If you were running this purely as a preventative measure, then you're good to go and proceed to the next step in your pipeline. If you were doing this to diagnose a problem, then you're back to square one -- but at least now you know it's not likely to be a SAM/BAM file format issue. One exception: some analysis tools require Read Group tags like SM that are not required by the format specification itself, so the input files will pass validation but the analysis tools will still error out. If that happens to you, check whether your files have SM tags in the @RG lines in their BAM header. That is the most common culprit.
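A quick way to do that check (illustrative only; the read group values shown in the comment are made up):

samtools view -H input.bam | grep '^@RG'
# You want to see an SM: entry in every @RG line, e.g.
# @RG   ID:lane1   PL:illumina   LB:lib1   SM:sample1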

+

However, if the command above outputs one or more of the 8 possible WARNING or 48 possible ERROR messages (see tables at the end of this document), you must proceed to the next step in the diagnostic workflow.

+

When run in SUMMARY mode, ValidateSamFile outputs a table that differentiates between two levels of error: ERROR proper and WARNING, based on the severity of problems that they would cause in downstream analysis. All problems that fall in the ERROR category must be addressed in order to proceed with other Picard or GATK tools, while those that fall in the WARNING category may often be ignored for some, if not all, subsequent analyses.

+

Example of error summary

| ValidateSamFile (SUMMARY)                | Count  |
|------------------------------------------|--------|
| ERROR:MISSING_READ_GROUP                 | 1      |
| ERROR:MISMATCH_MATE_ALIGNMENT_START      | 4      |
| ERROR:MATES_ARE_SAME_END                 | 894289 |
| ERROR:CIGAR_MAPS_OFF_REFERENCE           | 354    |
| ERROR:MATE_NOT_FOUND                     | 1      |
| ERROR:MISMATCH_FLAG_MATE_UNMAPPED        | 46672  |
| ERROR:MISMATCH_READ_LENGTH_AND_E2_LENGTH | 1      |
| WARNING:RECORD_MISSING_READ_GROUP        | 54     |
| WARNING:MISSING_TAG_NM                   | 33     |
+

This table, generated by ValidateSamFile from a real BAM file, indicates that this file has a total of 1 MISSING_READ_GROUP error, 4 MISMATCH_MATE_ALIGNMENT_START errors, 894,289 MATES_ARE_SAME_END errors, and so on. Moreover, this output also indicates that there are 54 RECORD_MISSING_READ_GROUP warnings and 33 MISSING_TAG_NM warnings.

+

2. Generate detailed list of ERROR records

+

Since ERRORs are more severe than WARNINGs, we focus on diagnosing and fixing them first. From the first step we only had a summary of errors, so now we generate a more detailed report with this command:

+
$ java -jar picard.jar ValidateSamFile \ 
+        I=input.bam \ 
+        IGNORE_WARNINGS=true \
+        MODE=VERBOSE 
+

Note that we invoked the MODE=VERBOSE and the IGNORE_WARNINGS=true arguments.

+

The former is technically not necessary as VERBOSE is the tool's default mode, but we specify it here to make it clear that that's the behavior we want. This produces a complete list of every problematic record, as well as a more descriptive explanation for each type of ERROR than is given in the SUMMARY output.

+

The IGNORE_WARNINGS option enables us to specifically examine only the records with ERRORs. When working with large files, this feature can be quite helpful, because there may be many records with WARNINGs that are not immediately important, and we don't want them flooding the log output.

+

Example of VERBOSE report for ERRORs only

| ValidateSamFile (VERBOSE) Error | Description |
|---|---|
| ERROR: Read groups is empty | Empty read group field for multiple records |
| ERROR: Record 1, Read name 20FUKAAXX100202:6:27:4968:125377 | Mate alignment does not match alignment start of mate |
| ERROR: Record 3, Read name 20FUKAAXX100202:6:27:4986:125375 | Both mates are marked as second of pair |
| ERROR: Record 6, Read name 20GAVAAXX100126:4:47:18102:194445 | Read CIGAR M operator maps off end of reference |
| ERROR: Read name 30PPJAAXX090125:1:60:1109:517#0 | Mate not found for paired read |
| ERROR: Record 402, Read name 20GAVAAXX100126:3:44:17022:23968 | Mate unmapped flag does not match read unmapped flag of mate |
| ERROR: Record 12, Read name HWI-ST1041:151:C7BJEACXX:1:1101:1128:82805 | Read length does not match quals length |
+

These ERRORs are all problems that we must address before using this BAM file as input for further analysis. Most ERRORs can typically be fixed using Picard tools to either correct the formatting or fill in missing information, although sometimes you may want to simply filter out malformed reads using Samtools.

+

For example, MISSING_READ_GROUP errors can be solved by adding the read group information to your data using the AddOrReplaceReadGroups tool. Most mate pair information errors can be fixed with FixMateInformation.
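For instance, a sketch of a FixMateInformation command (file names are placeholders):

java -jar picard.jar FixMateInformation \
    I=input.bam \
    O=fixed_mates.bam \
    ADD_MATE_CIGAR=true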

+

Once you have attempted to fix the errors in your file, you should put your new SAM/BAM file through the first validation step in the workflow, running ValidateSamFile in SUMMARY mode again. We do this to evaluate whether our attempted fix has solved the original ERRORs, and/or any of the original WARNINGs, and/or introduced any new ERRORs or WARNINGs (sadly, this does happen).

+

If you still have ERRORs, you'll have to loop through this part of the workflow until no more ERRORs are detected.

+

If you have no more ERRORs, congratulations! It's time to look at the WARNINGs (assuming there are still some -- if not, you're off to the races).

+

3. Generate detailed list of WARNING records

+

To obtain more detailed information about the warnings, we invoke the following command:

+
$ java -jar picard.jar ValidateSamFile \ 
+        I=input.bam \ 
+        IGNORE=type \
+        MODE=VERBOSE 
+

At this time we often use the IGNORE option to tell the program to ignore a specific type of WARNING that we consider less important, in order to focus on the rest. In some cases we may even decide to not try to address some WARNINGs at all because we know they are harmless (for example, MATE_NOT_FOUND warnings are expected when working with a small snippet of data). But in general we do strongly recommend that you address all of them to avoid any downstream complications, unless you're sure you know what you're doing.

+

Example of VERBOSE report for WARNINGs only

| ValidateSamFile (VERBOSE) Warning | Description |
|---|---|
| WARNING: Read name H0164ALXX140820:2:1204:13829:66057 | A record is missing a read group |
| WARNING: Record 1, Read name HARMONIA-H16:1253:0:7:1208:15900:108776 | NM tag (nucleotide differences) is missing |
+

Here we see a read group-related WARNING which would probably be fixed when we fix the MISSING_READ_GROUP error we encountered earlier, hence the prioritization strategy of tackling ERRORs first and WARNINGs second.

+

We also see a WARNING about missing NM tags. This is an alignment tag that is added by some but not all genome aligners, and is not used by the downstream tools that we care about, so you may decide to ignore this warning by adding IGNORE=MISSING_TAG_NM from now on when you run ValidateSamFile on this file.

+

Once you have attempted to fix all the WARNINGs that you care about in your file, you put your new SAM/BAM file through the first validation step in the workflow again, running ValidateSamFile in SUMMARY mode. Again, we check that no new ERRORs have been introduced and that the only WARNINGs that remain are the ones we feel comfortable ignoring. If that's not the case we run through the workflow again. If it's all good, we can proceed with our analysis.

+
+

Appendix: List of all WARNINGs and ERRORs emitted by ValidateSamFile

+

We are currently in the process of updating the Picard website to include the following two tables, describing WARNING (Table I) and ERROR (Table II) cases. Until that's done, you can find them here.

+
Table I

| WARNING | Description |
|---|---|
| **Header Issues** | |
| INVALID_DATE_STRING | Date string is not ISO-8601 |
| INVALID_QUALITY_FORMAT | Quality encodings out of range; appear to be Solexa or Illumina when Phred expected. Avoid exception being thrown as a result of no qualities being read. |
| **General Alignment Record Issues** | |
| ADJACENT_INDEL_IN_CIGAR | CIGAR string contains an insertion (I) followed by deletion (D), or vice versa |
| RECORD_MISSING_READ_GROUP | A SAMRecord is found with no read group id |
| **Mate Pair Issues** | |
| PAIRED_READ_NOT_MARKED_AS_FIRST_OR_SECOND | Pair flag set but not marked as first or second of pair |
| **Optional Alignment Tag Issues** | |
| MISSING_TAG_NM | The NM tag (nucleotide differences) is missing |
| E2_BASE_EQUALS_PRIMARY_BASE | Secondary base calls should not be the same as primary, unless one or the other is N |
| **General File, Index or Sequence Dictionary Issues** | |
| BAM_FILE_MISSING_TERMINATOR_BLOCK | BAM appears to be healthy, but is an older file so doesn't have terminator block |
+
+
Table II

| ERROR | Description |
|---|---|
| **Header Issues** | |
| DUPLICATE_PROGRAM_GROUP_ID | Same program group id appears more than once |
| DUPLICATE_READ_GROUP_ID | Same read group id appears more than once |
| HEADER_RECORD_MISSING_REQUIRED_TAG | Header tag missing in header line |
| HEADER_TAG_MULTIPLY_DEFINED | Header tag appears more than once in header line with different value |
| INVALID_PLATFORM_VALUE | The read group has an invalid value set for its PL field |
| INVALID_VERSION_NUMBER | Does not match any of the acceptable versions |
| MISSING_HEADER | The SAM/BAM file is missing the header |
| MISSING_PLATFORM_VALUE | The read group is missing its PL (platform unit) field |
| MISSING_READ_GROUP | The header is missing read group information |
| MISSING_SEQUENCE_DICTIONARY | There is no sequence dictionary in the header |
| MISSING_VERSION_NUMBER | Header has no version number |
| POORLY_FORMATTED_HEADER_TAG | Header tag does not have colon |
| READ_GROUP_NOT_FOUND | A read group ID on a SAMRecord is not found in the header |
| UNRECOGNIZED_HEADER_TYPE | Header record is not one of the standard types |
| **General Alignment Record Issues** | |
| CIGAR_MAPS_OFF_REFERENCE | Bases corresponding to M operator in CIGAR extend beyond reference |
| INVALID_ALIGNMENT_START | Alignment start position is incorrect |
| INVALID_CIGAR | CIGAR string error for either read or mate |
| INVALID_FLAG_FIRST_OF_PAIR | First of pair flag set for unpaired read |
| INVALID_FLAG_SECOND_OF_PAIR | Second of pair flag set for unpaired read |
| INVALID_FLAG_PROPER_PAIR | Proper pair flag set for unpaired read |
| INVALID_FLAG_MATE_NEG_STRAND | Mate negative strand flag set for unpaired read |
| INVALID_FLAG_NOT_PRIM_ALIGNMENT | Not primary alignment flag set for unmapped read |
| INVALID_FLAG_SUPPLEMENTARY_ALIGNMENT | Supplementary alignment flag set for unmapped read |
| INVALID_FLAG_READ_UNMAPPED | Mapped read flag not set for mapped read |
| INVALID_INSERT_SIZE | Inferred insert size is out of range |
| INVALID_MAPPING_QUALITY | Mapping quality set for unmapped read or is >= 256 |
| INVALID_PREDICTED_MEDIAN_INSERT_SIZE | PI tag value is not numeric |
| MISMATCH_READ_LENGTH_AND_QUALS_LENGTH | Length of sequence string and length of base quality string do not match |
| TAG_VALUE_TOO_LARGE | Unsigned integer tag value is deprecated in BAM. Template length |
| **Mate Pair Issues** | |
| INVALID_FLAG_MATE_UNMAPPED | Mate unmapped flag is incorrectly set |
| MATE_NOT_FOUND | Read is marked as paired, but its pair was not found |
| MATE_CIGAR_STRING_INVALID_PRESENCE | A cigar string for a read whose mate is NOT mapped |
| MATE_FIELD_MISMATCH | Read alignment fields do not match its mate |
| MATES_ARE_SAME_END | Both mates of a pair are marked either as first or second mates |
| MISMATCH_FLAG_MATE_UNMAPPED | Mate unmapped flag does not match read unmapped flag of mate |
| MISMATCH_FLAG_MATE_NEG_STRAND | Mate negative strand flag does not match read strand flag |
| MISMATCH_MATE_ALIGNMENT_START | Mate alignment does not match alignment start of mate |
| MISMATCH_MATE_CIGAR_STRING | The mate cigar tag does not match its mate's cigar string |
| MISMATCH_MATE_REF_INDEX | Mate reference index (MRNM) does not match reference index of mate |
| **Optional Alignment Tag Issues** | |
| INVALID_MATE_REF_INDEX | Mate reference index (MRNM) set for unpaired read |
| INVALID_TAG_NM | The NM tag (nucleotide differences) is incorrect |
| MISMATCH_READ_LENGTH_AND_E2_LENGTH | Lengths of secondary base calls tag values and read should match |
| MISMATCH_READ_LENGTH_AND_U2_LENGTH | Secondary base quals tag values should match read length |
| EMPTY_READ | Indicates that a read corresponding to the first strand has a length of zero and/or lacks flow signal intensities (FZ) |
| INVALID_INDEXING_BIN | Indexing bin set on SAMRecord does not agree with computed value |
| **General File, Index or Sequence Dictionary Issues** | |
| INVALID_INDEX_FILE_POINTER | Invalid virtualFilePointer in index |
| INVALID_REFERENCE_INDEX | Reference index not found in sequence dictionary |
| RECORD_OUT_OF_ORDER | The record is out of order |
| TRUNCATED_FILE | BAM file does not have terminator block |
+
\ No newline at end of file diff --git a/doc_archive/problems/I_am_unable_to_use_VQSR_(recalibration)_to_filter_variants.md b/doc_archive/problems/I_am_unable_to_use_VQSR_(recalibration)_to_filter_variants.md new file mode 100644 index 000000000..0b3637f9b --- /dev/null +++ b/doc_archive/problems/I_am_unable_to_use_VQSR_(recalibration)_to_filter_variants.md @@ -0,0 +1,55 @@ +## I am unable to use VQSR (recalibration) to filter variants + +http://gatkforums.broadinstitute.org/gatk/discussion/3225/i-am-unable-to-use-vqsr-recalibration-to-filter-variants + +

The problem:

+

Our preferred method for filtering variants after the calling step is to use VQSR, a.k.a. recalibration. However, it requires well-curated training/truth resources, which are typically not available for organisms other than humans, and it also requires a large amount of variant sites to operate properly, so it is not suitable for some small-scale experiments such as targeted gene panels or exome studies with fewer than 30 exomes. For the latter, it is sometimes possible to pad your cohort with exomes from another study (especially for humans -- use 1000 Genomes or ExAC!) but again for non-human organisms it is often not possible to do this.

+
+

The solution: hard-filtering

+

So, if this is your case and you are sure that you cannot use VQSR, then you will need to use the VariantFiltration tool to hard-filter your variants. To do this, you will need to compose filter expressions using JEXL as explained here based on the generic filter recommendations detailed below. There is a tutorial that shows how to achieve this step by step. Be sure to also read the documentation explaining how to understand and improve upon the generic hard filtering recommendations.

+
+

But first, some caveats

+

Let's be painfully clear about this: there is no magic formula that will give you perfect results. Filtering variants manually, using thresholds on annotation values, is subject to all sorts of caveats. The appropriateness of both the annotations and the threshold values is very highly dependent on the specific callset, how it was called, what the data was like, what organism it belongs to, etc.

+

HOWEVER, because we want to help and people always say that something is better than nothing (not necessarily true, but let's go with that for now), we have formulated some generic recommendations that should at least provide a starting point for people to experiment with their data.

+

In case you didn't catch that bit in bold there, we're saying that you absolutely SHOULD NOT expect to run these commands and be done with your analysis. You absolutely SHOULD expect to have to evaluate your results critically and TRY AGAIN with some parameter adjustments until you find the settings that are right for your data.

+

In addition, please note that these recommendations are mainly designed for dealing with very small data sets (in terms of both number of samples or size of targeted regions). If you are not using VQSR because you do not have training/truth resources available for your organism, then you should expect to have to do even more tweaking on the filtering parameters.

+
+

Filtering recommendations

+

Here are some recommended arguments to use with VariantFiltration when ALL other options are unavailable to you. Be sure to read the documentation explaining how to understand and improve upon these recommendations.

+

Note that these JEXL expressions will tag as filtered any sites where the annotation value matches the expression. So if you use the expression QD < 2.0, any site with a QD lower than 2 will be tagged as failing that filter.
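To make that concrete, here is a minimal sketch of a VariantFiltration command using just that example expression (the threshold, filter name and file names are illustrative -- build your actual expressions from the recommendations below and from your own evaluation of the results):

java -jar GenomeAnalysisTK.jar \
    -T VariantFiltration \
    -R reference.fasta \
    -V raw_variants.vcf \
    --filterExpression "QD < 2.0" \
    --filterName "lowQD" \
    -o filtered_variants.vcf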

+

For SNPs:

+ +

If your callset was generated with UnifiedGenotyper for legacy reasons, you can add HaplotypeScore > 13.0.

+

For indels:

+ +
+

And now some more IMPORTANT caveats (don't skip this!)

+ +
+

Finally, a note of hope

+

Some bits of this article may seem harsh, or depressing. Sorry. We believe in giving you the cold hard truth.

+

HOWEVER, we do understand that this is one of the major points of pain that GATK users encounter -- along with understanding how VQSR works, so really, whichever option you go with, you're going to suffer.

+

And we do genuinely want to help. So although we can't look at every single person's callset and give an opinion on how it looks (no, seriously, don't ask us to do that), we do want to hear from you about how we can best help you help yourself. What information do you feel would help you make informed decisions about how to set parameters? Are the meanings of the annotations not clear? Would knowing more about how they are computed help you understand how you can use them? Do you want more math? Less math, more concrete examples?

+

Tell us what you'd like to see here, and we'll do our best to make it happen. (no unicorns though, we're out of stock)

+

We also welcome testimonials from you. We are one small team; you are a legion of analysts all trying different things. Please feel free to come forward and share your findings on what works particularly well in your hands.

\ No newline at end of file diff --git a/doc_archive/problems/I_do_not_get_the_annotations_I_specified_with_-A.md b/doc_archive/problems/I_do_not_get_the_annotations_I_specified_with_-A.md new file mode 100644 index 000000000..5d2170c40 --- /dev/null +++ b/doc_archive/problems/I_do_not_get_the_annotations_I_specified_with_-A.md @@ -0,0 +1,27 @@ +## I do not get the annotations I specified with -A + +http://gatkforums.broadinstitute.org/gatk/discussion/6022/i-do-not-get-the-annotations-i-specified-with-a + +

The problem

+

You specified -A <some annotation> in a command line invoking one of the annotation-capable tools (HaplotypeCaller, MuTect2, UnifiedGenotyper and VariantAnnotator), but that annotation did not show up in your output VCF.

+

Keep in mind that all annotations that are necessary to run our Best Practices are annotated by default, so you should generally not need to request annotations unless you're doing something a bit special.

+

Why this happens & solutions

+

There can be several reasons why this happens, depending on the tool, the annotation, and you data. These are the four we see most often; if you encounter another that is not listed here, let us know in the comments.

+
1. You requested an annotation that cannot be calculated by the tool

    For example, you're running MuTect2 but requested an annotation that is specific to HaplotypeCaller. There should be an error message to that effect in the output log. It's not possible to override this; but if you believe the annotation should be available to the tool, let us know in the forum and we'll consider putting in a feature request.

2. You requested an annotation that can only be calculated if an optional input is provided

    For example, you're running HaplotypeCaller and you want InbreedingCoefficient, but you didn't specify a pedigree file. There should be an error message to that effect in the output log. The solution is simply to provide the missing input file. Another example: you're running VariantAnnotator and you want to annotate Coverage, but you didn't specify a BAM file. The tool needs to see the read data in order to calculate the annotation, so again, you simply need to provide the BAM file.

3. You requested an annotation that has requirements which are not met by some or all sites

    For example, you're looking at RankSumTest annotations, which require heterozygous sites in order to perform the necessary calculations, but you're running on haploid data so you don't have any het sites. There is no workaround; the annotation is not applicable to your data. Another example: you requested InbreedingCoefficient, but your population includes fewer than 10 founder samples, which are required for the annotation calculation. There is no workaround; the annotation is not applicable to your data.

4. You requested an annotation that is already applied by default by the tool you are running

    For example, you requested Coverage from HaplotypeCaller, which already annotates this by default. There is currently a bug that causes some default annotations to be dropped from the list if specified on the command line. This will be addressed in an upcoming version. For now the workaround is to check what annotations are applied by default and NOT request them with -A.
\ No newline at end of file diff --git a/doc_archive/problems/I_expect_to_see_a_variant_at_a_specific_site,_but_it's_not_getting_called.md b/doc_archive/problems/I_expect_to_see_a_variant_at_a_specific_site,_but_it's_not_getting_called.md new file mode 100644 index 000000000..54f3c6dd5 --- /dev/null +++ b/doc_archive/problems/I_expect_to_see_a_variant_at_a_specific_site,_but_it's_not_getting_called.md @@ -0,0 +1,32 @@ +## I expect to see a variant at a specific site, but it's not getting called + +http://gatkforums.broadinstitute.org/gatk/discussion/1235/i-expect-to-see-a-variant-at-a-specific-site-but-its-not-getting-called + +

This can happen when you expect a call to be made based on the output of other variant calling tools, or based on examination of the data in a genome browser like IGV.

+

There are several possibilities, and among them, it is possible that GATK may be missing a real variant. But we are generally very confident in the calculations made by our tools, and in our experience, most of the time, the problem lies elsewhere. So, before you post this issue in our support forum, please follow these troubleshooting guidelines, which hopefully will help you figure out what's going on.

+

In all cases, to diagnose what is happening, you will need to look directly at the sequencing data at the position in question.

+

1. Generate the bamout and compare it to the input bam

+

If you are using HaplotypeCaller to call your variants (as you nearly always should) you'll need to run an extra step first to produce a file called the "bamout file". See this tutorial for step-by-step instructions on how to do this.

+

What often happens is that when you look at the reads in the original bam file, it looks like a variant should be called. However, once HaplotypeCaller has performed the realignment, the reads may no longer support the expected variant. Generating the bamout file and comparing it to the original bam will allow you to elucidate such cases.

+

In the example below, you see the original bam file on the top, and on the bottom is the bam file after reassembly. In this case, there seem to be many SNPs present; however, after reassembly, we find there is really a large deletion!

+

+

2. Check the base qualities of the non-reference bases

+

The variant callers apply a minimum base quality threshold, under which bases will not be counted as supporting evidence for a variant. This is because low base qualities mean that the sequencing machine was not confident that it called the right bases. If your expected variant is only supported by low-confidence bases, it is probably a false positive.

+

Keep in mind that the depth reported in the DP field of the VCF is the unfiltered depth. You may believe you have good coverage at your site of interest, but since the variant callers ignore bases that fail the quality filters, the actual coverage seen by the variant callers may be lower than you think.

+

3. Check the mapping qualities of the reads that support the non-reference allele(s)

+

The quality of a base is capped by the mapping quality of the read that it is on. This is because low mapping qualities mean that the aligner had little confidence that the read was mapped to the correct location in the genome. You may be seeing mismatches because the read doesn't belong there -- in fact, you may be looking at the sequence of some other locus in the genome!

+

Keep in mind also that reads with mapping quality 255 ("unknown") are ignored.

+

4. Check how many alternate alleles are present

+

By default the variant callers will only consider a certain number of alternate alleles. This parameter can be relaxed using the --max_alternate_alleles argument (see the HaplotypeCaller documentation page to find out what is the default value for this argument). Note however that genotyping sites with many alternate alleles increases the computational cost of the processing, scaling exponentially with the number of alternate alleles, which means it will use more resources and take longer. Unless you have a really good reason to change the default value, we highly recommend that you not modify this parameter.

+

5. When using UnifiedGenotyper, check for overlapping deletions

+

The UnifiedGenotyper ignores sites if there are too many overlapping deletions. This parameter can be relaxed using the --max_deletion_fraction argument (see the UG's documentation page to find out what is the default value for this argument) but be aware that increasing its value could adversely affect the reliability of your results.

+

6. Check for systematic biases introduced by your sequencing technology

+

Some sequencing technologies introduce particular sources of bias. For example, in data produced by the SOLiD platform, alignments tend to have reference bias, and it can be severe in some cases. If the SOLiD reads have a lot of mismatches (no-calls count as mismatches) around the site, you are probably seeing false positives.

+

7. Try fiddling with graph arguments (ADVANCED)

+

This is highly experimental, but if all else fails, worth a shot (with HaplotypeCaller and MuTect2).

+

Fiddle with kmers

+

In some difficult sequence contexts (e.g. repeat regions), when some default-sized kmers are non-unique, cycles get generated in the graph. By default the program increases the kmer size automatically to try again, but after several attempts it will eventually quit trying and fail to call the expected variant (typically because the variant gets pruned out of the read-threading assembly graph, and is therefore never assembled into a candidate haplotype). We've seen cases where it's still possible to force a resolution using -allowNonUniqueKmersInRef and/or increasing the --kmerSize (or range of permitted sizes: 10, 25, 35 for example).

+
Note: While --allowNonUniqueKmersInRef can recover calls that would otherwise be missed in repeat regions, it should not be used in all regions, as it may increase false positives. We have plans to improve variant calling in repeat regions, but for now please try this flag if you notice calls being missed in repeat regions.
+

Fiddle with pruning

+

Decreasing the value of -minPruning and/or -minDanglingBranchLength (i.e. reducing the amount of evidence necessary to keep a path in the graph) can recover variants, at the risk of taking on more false positives.

\ No newline at end of file diff --git a/doc_archive/problems/I_need_to_run_programs_that_require_different_versions_of_Java.md b/doc_archive/problems/I_need_to_run_programs_that_require_different_versions_of_Java.md new file mode 100644 index 000000000..6497998f4 --- /dev/null +++ b/doc_archive/problems/I_need_to_run_programs_that_require_different_versions_of_Java.md @@ -0,0 +1,11 @@ +## I need to run programs that require different versions of Java + +http://gatkforums.broadinstitute.org/gatk/discussion/6841/i-need-to-run-programs-that-require-different-versions-of-java + +

We sometimes need to be able to use multiple versions of Java on the same computer to run command-line tools that have different version requirements. At the time of writing, GATK requires an older version of Java (1.7), whereas Picard requires the most recent version (1.8). So that you can run both Picard tools and GATK tools on the same computer, we present a reasonably painless solution.

+

You will need to have both versions of Java installed on your machine. The Java installation package for 1.8 can be found here, and the package for 1.7 is here. Note that we point to the “JDK” (Java Development Kit) packages because they are the most complete Java packages (suitable for developing in Java as well as running Java executables), and we have had reports that the “JRE” (Java Runtime Environment) equivalents were not sufficient to run GATK on some machines.

+

First, check your current default java version by opening your terminal and typing java -version. If the version starts with “1.8”, you will need to add the following code to the beginning of your GATK command to specify that it should be run using version 1.7.

+ +

If your default version starts with “1.7”, then you will need to prepend the code below to your Picard command:

+ +

You may need to change the orange part in each code snippet, which should refer to the specific version of java you have installed on your machine (version and update). To find that, simply navigate to the folder where you had installed the JDK. Under the “JavaVirtualMachines” folder, you should find JDK folders that name the specific version and update.

\ No newline at end of file diff --git a/doc_archive/queue/Frequently_asked_questions_about_QScripts.md b/doc_archive/queue/Frequently_asked_questions_about_QScripts.md new file mode 100644 index 000000000..535708e29 --- /dev/null +++ b/doc_archive/queue/Frequently_asked_questions_about_QScripts.md @@ -0,0 +1,95 @@ +## Frequently asked questions about QScripts + +http://gatkforums.broadinstitute.org/gatk/discussion/1314/frequently-asked-questions-about-qscripts + +

1. Many of my GATK functions are set up with the same Reference, Intervals, etc. Is there a quick way to reuse these values for the different analyses in my pipeline?

+

Yes.

+ +

For more information, see the ExampleUnifiedGenotyper.scala or examples of using Scala's traits/mixins illustrated in the QScripts documentation.

+
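As a concrete sketch of the traits/mixins approach (modelled on ExampleUnifiedGenotyper.scala; the class, trait, walker and file names below are illustrative, not prescribed):

    import java.io.File
    import org.broadinstitute.sting.queue.QScript
    import org.broadinstitute.sting.queue.extensions.gatk._

    class MyPipeline extends QScript {
      // Settings shared by every GATK function in this pipeline
      trait CommonArguments extends CommandLineGATK {
        this.reference_sequence = new File("/path/to/reference.fasta")
        this.intervals = List(new File("/path/to/targets.interval_list"))
        this.memoryLimit = 2
      }

      def script() {
        // Mix the trait into each walker wrapper so the shared values are applied automatically
        val genotyper = new UnifiedGenotyper with CommonArguments
        val filter = new VariantFiltration with CommonArguments
        // ... set the walker-specific inputs and outputs here, then:
        add(genotyper)
        add(filter)
      }
    }

Any value set in the trait is applied to every function that mixes it in, so shared settings only need to be written once.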

2. How do I accept a list of arguments to my QScript?

+

In your QScript, define a var list and annotate it with @Argument. Initialize the value to Nil.

+
@Argument(doc="filter names", shortName="filter")
+var filterNames: List[String] = Nil
+

On the command line specify the arguments by repeating the argument name.

+
-filter filter1 -filter filter2 -filter filter3
+

Then once your QScript is run, the command line arguments will be available for use in the QScript's script method.

+
  def script {
+     var myCommand = new MyFunction
+     myCommand.filters = this.filterNames
+  }
+

For a full example of command line arguments see the QScripts documentation.

+

3. What is the best way to run a utility method at the right time?

+

Wrap the utility with an InProcessFunction. If your functionality is reusable code you should add it to Sting Utils with Unit Tests and then invoke your new function from your InProcessFunction. Computationally or memory intensive functions should NOT be implemented as InProcessFunctions, and should be wrapped in Queue CommandLineFunctions instead.

+
    class MySplitter extends InProcessFunction {
+      @Input(doc="inputs")
+      var in: File = _
+
+      @Output(doc="outputs")
+      var out: List[File] = Nil
+
+      def run {
+         StingUtilityMethod.quickSplitFile(in, out)
+      }
+    }
+
+    var splitter = new MySplitter
+    splitter.in = new File("input.txt")
+    splitter.out = List(new File("out1.txt"), new File("out2.txt"))
+    add(splitter)
+

See Queue CommandLineFunctions for more information on how @Input and @Output are used.

+

4. What is the best way to write a list of files?

+

Create an instance of a ListWriterFunction and add it in your script method.

+
import org.broadinstitute.sting.queue.function.ListWriterFunction
+
+val writeBamList = new ListWriterFunction
+writeBamList.inputFiles = bamFiles
+writeBamList.listFile = new File("myBams.list")
+add(writeBamList)
+

5. How do I add optional debug output to my QScript?

+

Queue contains a trait mixin you can use to add Log4J support to your classes.

+

Add the import for the trait Logging to your QScript.

+
import org.broadinstitute.sting.queue.util.Logging
+

Mixin the trait to your class.

+
class MyScript extends Logging {
+...
+

Then use the mixed in logger to write debug output when the user specifies -l DEBUG.

+
logger.debug("This will only be displayed when debugging is enabled.")
+

6. I updated Queue and now I'm getting java.lang.NoClassDefFoundError / java.lang.AbstractMethodError

+

Try ant clean.

+

Queue relies on a lot of Scala traits / mixins. These dependencies are not always picked up by the scala/java compilers, leading to partially implemented classes. If that doesn't work, please let us know in the forum.

+

7. Do I need to create directories in my QScript?

+

No. QScript will create all parent directories for outputs.

+

8. How do I specify the -W 240 for the LSF hour queue at the Broad?

+

Queue's LSF dispatcher automatically looks up and sets the maximum runtime for whichever LSF queue is specified. If you set your -jobQueue/.jobQueue to hour then you should see something like this under bjobs -l:

+
RUNLIMIT
+240.0 min of gsa3
+

9. Can I run Queue with GridEngine?

+

Queue GridEngine functionality is community supported. See here for full details: Queue with Grid Engine.

+

10. How do I pass advanced java arguments to my GATK commands, such as remote debugging?

+

The easiest way to do this at the moment is to mix in a trait.

+

First define a trait which adds your java options:

+
  trait RemoteDebugging extends JavaCommandLineFunction {
+    override def javaOpts = super.javaOpts + " -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=5005"
+  }
+

Then mix in the trait to your walker and otherwise run it as normal:

+
  val printReadsDebug = new PrintReads with RemoteDebugging
+  printReadsDebug.reference_sequence = "my.fasta"
+  // continue setting up your walker...
+  add(printReadsDebug)
+

11. Why does Queue log "Running jobs. ... Done." but doesn't actually run anything?

+

If you see something like the following, it means that Queue believes that it previously successfully generated all of the outputs.

+
INFO 16:25:55,049 QCommandLine - Scripting ExampleUnifiedGenotyper 
+INFO 16:25:55,140 QCommandLine - Added 4 functions 
+INFO 16:25:55,140 QGraph - Generating graph. 
+INFO 16:25:55,164 QGraph - Generating scatter gather jobs. 
+INFO 16:25:55,714 QGraph - Removing original jobs. 
+INFO 16:25:55,716 QGraph - Adding scatter gather jobs. 
+INFO 16:25:55,779 QGraph - Regenerating graph. 
+INFO 16:25:55,790 QGraph - Running jobs. 
+INFO 16:25:55,853 QGraph - 0 Pend, 0 Run, 0 Fail, 10 Done 
+INFO 16:25:55,902 QCommandLine - Done 
+

Queue will not re-run the job if a .done file is found for all the outputs, e.g.: /path/to/.output.file.done. You can either remove the specific .done files yourself, or use the -startFromScratch command line option.

\ No newline at end of file diff --git a/doc_archive/queue/Overview_of_Queue.md b/doc_archive/queue/Overview_of_Queue.md new file mode 100644 index 000000000..5b36da7ef --- /dev/null +++ b/doc_archive/queue/Overview_of_Queue.md @@ -0,0 +1,94 @@ +## Overview of Queue + +http://gatkforums.broadinstitute.org/gatk/discussion/1306/overview-of-queue + +

1. Introduction

+

GATK-Queue is a command-line scripting framework for defining multi-stage genomic analysis pipelines, combined with an execution manager that runs those pipelines from end-to-end. Processing genome data often includes several steps to produce outputs; for example, our BAM to VCF calling pipeline includes, among other things:

+ +

Running these tools one by one in series can take weeks of processing, or would require custom scripting to try to optimize the use of parallel resources.

+

With a Queue script users can semantically define the multiple steps of the pipeline and then hand off the logistics of running the pipeline to completion. Queue runs independent jobs in parallel, handles transient errors, and uses various techniques such as running multiple copies of the same program on different portions of the genome to produce outputs faster.

+
+

2. Obtaining Queue

+

You have two options: download the binary distribution (prepackaged, ready to run program) or build it from source.

+

- Download the binary

+

This is obviously the easiest way to go. Links are on the Downloads page. Just get the Queue package; no need to get the GATK package separately as GATK is bundled in with Queue.

+

- Building Queue from source

+

Briefly, here's what you need to know/do:

+

Queue is part of the GATK repository. Download the source from the public repository on Github. Run the following command:

+
git clone https://github.com/broadgsa/gatk.git
+

IMPORTANT NOTE: These instructions refer to the MIT-licensed version of the GATK+Queue source code. With that version, you will be able to build Queue itself, as well as the public portion of the GATK (the core framework), but that will not include the GATK analysis tools. If you want to use Queue to pipeline the GATK analysis tools, you need to clone the 'protected' repository. Please note however that part of the source code in that repository (the 'protected' module) is under a different license which excludes for-profit use, modification and redistribution.

+

Move to the git root directory and use maven to build the source.

+
mvn clean verify
+

All dependencies will be managed by Maven as needed.

+

See this article on how to test your installation of Queue.

+
+

3. Running Queue

+

See this article on running Queue for the first time for full details.

+

Queue arguments can be listed by running with --help

+
java -jar dist/Queue.jar --help
+

To list the arguments required by a QScript, add the script with -S and run with --help.

+
java -jar dist/Queue.jar -S script.scala --help
+

Note that by default Queue runs in a "dry" mode, as explained in the link above. After verifying the generated commands, execute the pipeline by adding -run.

+

See QFunction and Command Line Options for more info on adjusting Queue options.

+

4. QScripts

+

General Information

+

Queue pipelines are written as Scala 2.8 files with a bit of syntactic sugar, called QScripts.

+

Every QScript includes the following steps:

+ +

The basic command-line to run the Queue pipelines on the command line is

+
java -jar Queue.jar -S <script>.scala
+

See the main article Queue QScripts for more info on QScripts.

+

Supported QScripts

+

Most QScripts are analysis pipelines that are custom-built for specific projects, and we currently do not offer any QScripts as supported analysis tools. However, we do provide some example scripts that you can use as a basis for writing your own QScripts (see below).

+

Example QScripts

+

The latest versions of the example files are available in the Sting github repository under public/scala/qscript/examples

+
+

5. Visualization and Queue

+

QJobReport

+

Queue automatically generates GATKReport-formatted runtime information about executed jobs. See this presentation for a general introduction to QJobReport.

+

Note that Queue attempts to generate a standard visualization using an R script in the GATK public/R repository. You must provide a path to this location if you want the script to run automatically. Additionally the script requires the gsalib to be installed on the machine, which is typically done by providing its path in your .Rprofile file:

+
bm8da-dbe ~/Desktop/broadLocal/GATK/unstable % cat ~/.Rprofile
+.libPaths("/Users/depristo/Desktop/broadLocal/GATK/unstable/public/R/")
+

Note that gsalib is available from the CRAN repository so you can install it with the canonical R package install command.

+

Caveats

+ +

DOT visualization of Pipelines

+

Queue emits a queue.dot file to help visualize your commands. You can open this file in programs like DOT, OmniGraffle, etc to view your pipelines. By default the system will print out your LSF command lines, but this can be too much in a complex pipeline.

+

To clarify your pipeline, override the dotString() function:

+
class CountCovariates(bamIn: File, recalDataIn: File, args: String = "") extends GatkFunction {
+    @Input(doc="foo") var bam = bamIn
+    @Input(doc="foo") var bamIndex = bai(bamIn)
+    @Output(doc="foo") var recalData = recalDataIn
+    memoryLimit = Some(4)
+    override def dotString = "CountCovariates: %s [args %s]".format(bamIn.getName, args)
+    def commandLine = gatkCommandLine("CountCovariates") + args + " -l INFO -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod -I %s --max_reads_at_locus 20000 -cov ReadGroupCovariate -cov QualityScoreCovariate -cov CycleCovariate -cov DinucCovariate -recalFile %s".format(bam, recalData)
+}
+

Here we only see CountCovariates my.bam [-OQ], for example, in the dot file. The base quality score recalibration pipeline, as visualized by DOT, can be viewed here:

+

6. Further reading

+ \ No newline at end of file diff --git a/doc_archive/queue/Pipelining_the_GATK_with_Queue.md b/doc_archive/queue/Pipelining_the_GATK_with_Queue.md new file mode 100644 index 000000000..9801721f3 --- /dev/null +++ b/doc_archive/queue/Pipelining_the_GATK_with_Queue.md @@ -0,0 +1,188 @@ +## Pipelining the GATK with Queue + +http://gatkforums.broadinstitute.org/gatk/discussion/1310/pipelining-the-gatk-with-queue + +

1. Introduction

+

As mentioned in the introductory materials, the core concept behind the GATK tools is the walker. The Queue scripting framework contains several mechanisms which make it easy to chain together GATK walkers.

+

2. Authoring walkers

+

As part of authoring your walker there are several Queue behaviors that you can specify for [QScript]() authors using your particular walker.

+

Specifying how to partition

+

Queue can significantly speed up generating walker outputs by passing different instances of the GATK the same BAM or VCF data but specifying different regions of the data to analyze. After the different instances output their individual results, Queue will gather the results back to the original output path requested by the QScript.

+

Queue limits the level it will split genomic data by examining the @PartitionBy() annotation for your walker which specifies a PartitionType. This table lists the different partition types along with the default partition level for each of the different walker types.

+

| PartitionType | Default for Walker Type | Description | Example Intervals | Example Splits |
|---|---|---|---|---|
| PartitionType.CONTIG | Read walkers | Data is grouped together so that all genomic data from the same contig is never presented to two different instances of the GATK. | original: chr1:10-11, chr2:10-20, chr2:30-40, chr2:50-60, chr3:10-11 | split 1: chr1:10-11, chr2:10-20, chr2:30-40, chr2:50-60; split 2: chr3:10-11 |
| PartitionType.INTERVAL | (none) | Data is split down to the interval level but never divides up an explicitly specified interval. If no explicit intervals are specified in the QScript for the GATK then this is effectively the same as splitting by contig. | original: chr1:10-11, chr2:10-20, chr2:30-40, chr2:50-60, chr3:10-11 | split 1: chr1:10-11, chr2:10-20, chr2:30-40; split 2: chr2:50-60, chr3:10-11 |
| PartitionType.LOCUS | Locus walkers, ROD walkers | Data is split down to the locus level, possibly dividing up intervals. | original: chr1:10-11, chr2:10-20, chr2:30-40, chr2:50-60, chr3:10-11 | split 1: chr1:10-11, chr2:10-20, chr2:30-35; split 2: chr2:36-40, chr2:50-60, chr3:10-11 |
| PartitionType.NONE | Read pair walkers, Duplicate walkers | The data cannot be split and Queue must run the single instance of the GATK as specified in the QScript. | original: chr1:10-11, chr2:10-20, chr2:30-40, chr2:50-60, chr3:10-11 | no split: chr1:10-11, chr2:10-20, chr2:30-40, chr2:50-60, chr3:10-11 |
+

If your walker is implemented in a way that Queue should not divide up your data, you should explicitly set @PartitionBy(PartitionType.NONE). If your walker can theoretically be run per genome location, specify @PartitionBy(PartitionType.LOCUS).

+
@PartitionBy(PartitionType.LOCUS)
+public class ExampleWalker extends LocusWalker<Integer, Integer> {
+...
+

Specifying how to join outputs

+

Queue will join the standard walker outputs.

+

| Output type | Default gatherer implementation |
|---|---|
| SAMFileWriter | The BAM files are joined together using Picard's MergeSamFiles. |
| VCFWriter | The VCF files are joined together using the GATK CombineVariants. |
| PrintStream | The first two files are scanned for a common header. The header is written once into the output, and then each file is appended to the output, skipping past the header lines. |
+

If your PrintStream is not a simple text file that can be concatenated together, you must implement a Gatherer. Extend your custom Gatherer from the abstract base class and implement the gather() method.

+
package org.broadinstitute.sting.commandline;
+
+import java.io.File;
+import java.util.List;
+
+/**
+ * Combines a list of files into a single output.
+ */
+public abstract class Gatherer {
+    /**
+     * Gathers a list of files into a single output.
+     * @param inputs Files to combine.
+     * @param output Path to output file.
+     */
+    public abstract void gather(List<File> inputs, File output);
+
+    /**
+     * Returns true if the caller should wait for the input files to propagate over NFS before running gather().
+     */
+    public boolean waitForInputs() { return true; }
+}
+

Specify your gatherer using the @Gather() annotation by your @Output.

+
@Output
+@Gather(MyGatherer.class)
+public PrintStream out;
+

Queue will run your custom gatherer to join the intermediate outputs together.

+
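For instance, a minimal custom gatherer might just concatenate the intermediate text outputs in order. This is only a sketch (the class name is made up and error handling is omitted), but it shows the shape of a gather() implementation:

    import java.io.{File, FileWriter}
    import scala.io.Source
    import org.broadinstitute.sting.commandline.Gatherer

    // Illustrative gatherer that concatenates the intermediate text outputs in order
    class MyGatherer extends Gatherer {
      def gather(inputs: java.util.List[File], output: File) {
        val writer = new FileWriter(output)
        val it = inputs.iterator
        while (it.hasNext) {
          for (line <- Source.fromFile(it.next).getLines())
            writer.write(line + "\n")
        }
        writer.close()
      }
    }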

3. Using GATK walkers in Queue

+

Queue GATK Extensions

+

Running 'ant queue' builds a set of Queue extensions for the GATK-Engine. For every GATK walker and command line program in the compiled GenomeAnalysisTK.jar, a Queue compatible wrapper is generated.

+

The extensions can be imported via import org.broadinstitute.sting.queue.extensions.gatk._

+
import org.broadinstitute.sting.queue.QScript
+import org.broadinstitute.sting.queue.extensions.gatk._
+
+class MyQscript extends QScript {
+...
+

Note that the generated GATK extensions will automatically handle shell-escaping of all values assigned to the various Walker parameters, so you can rest assured that all of your values will be taken literally by the shell. Do not attempt to escape values yourself -- i.e.,

+

Do this:

+
filterSNPs.filterExpression = List("QD<2.0", "MQ<40.0", "HaplotypeScore>13.0")
+

NOT this:

+
filterSNPs.filterExpression = List("\"QD<2.0\"", "\"MQ<40.0\"", "\"HaplotypeScore>13.0\"")
+

Listing variables

+

In addition to the GATK documentation on this wiki you can also find the full list of arguments for each walker extension in a variety of ways.

+

The source code for the extensions is generated during ant queue and placed in this directory:

+
build/queue-extensions/src
+

When properly configured an IDE can provide command completion of the walker extensions. See Queue with IntelliJ IDEA for our recommended settings.

+

If you do not have access to an IDE you can still find the names of the generated variables using the command line. The generated variable names on each extension are based off of the fullName of the Walker argument. To see the built in documentation for each Walker, run the GATK with:

+
java -jar GenomeAnalysisTK.jar -T <walker name> -help
+

Once the import statement is specified you can add() instances of gatk extensions in your QScript's script() method.

+

Setting variables

+

If a GATK walker input allows more than one value, you should specify the values as a List().

+
  def script() {
+    val snps = new UnifiedGenotyper
+    snps.reference_file = new File("testdata/exampleFASTA.fasta")
+    snps.input_file = List(new File("testdata/exampleBAM.bam"))
+    snps.out = new File("snps.vcf")
+    add(snps)
+  }
+

The extensions also contain aliases from each long argument name to its short name, although using the short forms may make your QScript harder for others to read.

+
  def script() {
+    val snps = new UnifiedGenotyper
+    snps.R = new File("testdata/exampleFASTA.fasta")
+    snps.I = List(new File("testdata/exampleBAM.bam"))
+    snps.out = new File("snps.vcf")
+    add(snps)
+  }
+

Here are a few more examples using various list assignment operators.

+
  def script() {
+    val countCovariates = new CountCovariates
+
+    // Append to list using item appender :+
+    countCovariates.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
+
+    // Append to list using collection appender ++
+    countCovariates.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
+
+    // Assign list using plain old object assignment
+    countCovariates.input_file = List(inBam)
+
+    // The following is not a list, so just assigning one file to another
+    countCovariates.recal_file = outRecalFile
+
+    add(countCovariates)
+  }
+

Specifying an alternate GATK jar

+

By default Queue runs the GATK from the current classpath. This works best since the extensions are generated and compiled at the same time the GATK is compiled via ant queue.

+

If you need to swap in a different version of the GATK you may not be able to use the generated extensions. The alternate GATK jar must have the same command line arguments as the GATK compiled with Queue. Otherwise the arguments will not match and you will get an error when Queue attempts to run the alternate GATK jar. In this case you will have to create your own custom CommandLineFunction for your analysis.

+
  def script {
+    val snps = new UnifiedGenotyper
+    snps.jarFile = new File("myPatchedGATK.jar")
+    snps.reference_file = new File("testdata/exampleFASTA.fasta")
+    snps.input_file = List(new File("testdata/exampleBAM.bam"))
+    snps.out = new File("snps.vcf")
+    add(snps)
+  }
+

GATK scatter/gather

+

Queue currently allows QScript authors to explicitly invoke scatter/gather on GATK walkers by setting the scatter count on a function.

+
  def script {
+    val snps = new UnifiedGenotyper
+    snps.reference_file = new File("testdata/exampleFASTA.fasta")
+    snps.input_file = List(new File("testdata/exampleBAM.bam"))
+    snps.out = new File("snps.vcf")
+    snps.scatterCount = 20
+    add(snps)
+  }
+

This will run the UnifiedGenotyper up to 20 ways in parallel and then merge the partial VCFs back into the single snps.vcf.

+

Additional caveat

+

Some walkers are still being updated to support Queue fully. For example, they may not have defined their @Input and @Output annotations, so Queue is unable to correctly track their dependencies, or a custom Gatherer may not be implemented yet.

\ No newline at end of file diff --git a/doc_archive/queue/QFunction_and_Command_Line_Options.md b/doc_archive/queue/QFunction_and_Command_Line_Options.md new file mode 100644 index 000000000..fd4a91d3d --- /dev/null +++ b/doc_archive/queue/QFunction_and_Command_Line_Options.md @@ -0,0 +1,243 @@ +## QFunction and Command Line Options + +http://gatkforums.broadinstitute.org/gatk/discussion/1311/qfunction-and-command-line-options + +

These are the most popular Queue command line options. For a complete and up-to-date list, run with --help or -h. QScripts may also add additional command line options.

+

Please note that this page is out of date. We hope to update it in future but have no resources to do so at present. If you run into trouble using any of the command line arguments listed here, we recommend you check the source code for the Q arguments here. Apologies for the inconvenience.

+
+

1. Queue Command Line Options

+

| Command Line Argument | Description | Default |
|---|---|---|
| -run | If passed the scripts are run. If not passed a dry run is executed. | dry run |
| -jobRunner <jobrunner> | The job runner to dispatch jobs. Setting to Lsf706, GridEngine, or Drmaa will dispatch jobs to LSF or Grid Engine using the job settings (see below). Defaults to Shell which runs jobs on a local shell one at a time. | Shell |
| -bsub | Alias for -jobRunner Lsf706 | not set |
| -qsub | Alias for -jobRunner GridEngine | not set |
| -status | Prints out a summary of progress. If a QScript is currently running via -run, you can run the same command line with -status instead to print a summary of progress. | not set |
| -retry <count> | Retries a QFunction that returns a non-zero exit code up to count times. The QFunction must not have set jobRestartable to false. | 0 = no retries |
| -startFromScratch | Restarts the graph from the beginning. If not specified, then for each output file specified on a QFunction, ex: /path/to/output.file, Queue will not re-run the job if a .done file is found for all the outputs, ex: /path/to/.output.file.done. | use .done files to determine if jobs are complete |
| -keepIntermediates | By default Queue deletes the output files of QFunctions that set .isIntermediate to true. | delete intermediate files |
| -statusTo <email> | Email address to send status to whenever a) a job fails, or b) Queue has run all the functions it can run and is exiting. | not set |
| -statusFrom <email> | Email address to send status emails from. | user@local.domain |
| -dot <file> | If set renders the job graph to a dot file. | not rendered |
| -l <logging_level> | The minimum level of logging, DEBUG, INFO, WARN, or FATAL. | INFO |
| -log <file> | Sets the location to save log output in addition to standard out. | not set |
| -debug | Set the logging to include a lot of debugging information (SLOW!) | not set |
| -jobReport | Path to write the job report text file. If R is installed and available on the $PATH then a pdf will be generated visualizing the job report. | jobPrefix.jobreport.txt |
| -disableJobReport | Disables writing the job report. | not set |
| -help | Lists all of the command line arguments with their descriptions. | not set |
+

2. QFunction Options

+

The following options can be specified on the command line or overridden per QFunction; a short example of a per-function override follows the table.

+

| Command Line Argument | QFunction Property | Description | Default |
|---|---|---|---|
| -jobPrefix | .jobName | The unique name of the job. Used to prefix directories and log files. Use -jobNamePrefix on the Queue command line to replace the default prefix Q-<processid>@<host>. | <jobNamePrefix>-<jobNumber> |
| N/A | .jobOutputFile | Captures stdout and if jobErrorFile is null it captures stderr as well. | <jobName>.out |
| N/A | .jobErrorFile | If not null captures stderr. | null |
| N/A | .commandDirectory | The directory to execute the command line from. | current directory |
| -jobProject | .jobProject | The project name for the job. | default job runner project |
| -jobQueue | .jobQueue | The queue to dispatch the job. | default job runner queue |
| -jobPriority | .jobPriority | The dispatch priority for the job. Lowest priority = 0. Highest priority = 100. | default job runner priority |
| -jobNative | .jobNativeArgs | Native args to pass to the job runner. Currently only supported in GridEngine and Drmaa. The string is concatenated to the native arguments passed over DRMAA. Example: -w n. | none |
| -jobResReq | .jobResourceRequests | Resource requests to pass to the job runner. On GridEngine this is multiple -l <req>. On LSF a single -R <req> is generated. | memory reservations and limits on LSF and GridEngine |
| -jobEnv | .jobEnvironmentNames | Predefined environment names to pass to the job runner. On GridEngine this is -pe <env>. On LSF this is -a <env>. | none |
| -memLimit | .memoryLimit | The memory limit for the job in gigabytes. Used to populate the variables residentLimit and residentRequest which can also be set separately. | default job runner memory limit |
| -resMemLimit | .residentLimit | Limit for the resident memory in gigabytes. On GridEngine this is -l mem_free=<mem>. On LSF this is -R rusage[mem=<mem>]. | memoryLimit * 1.2 |
| -resMemReq | .residentRequest | Requested amount of resident memory in gigabytes. On GridEngine this is -l h_rss=<mem>. On LSF this is -R rusage[select=<mem>]. | memoryLimit |

3. Email Status Options

+

| Command Line Argument | Description | Default |
|---|---|---|
| -emailHost <hostname> | SMTP host name | localhost |
| -emailPort <port> | SMTP port | 25 |
| -emailTLS | If set uses TLS. | not set |
| -emailSSL | If set uses SSL. | not set |
| -emailUser <username> | If set along with emailPass or emailPassFile authenticates the email with this username. | not set |
| -emailPassFile <file> | If emailUser is also set authenticates the email with contents of the file. | not set |
| -emailPass <password> | If emailUser is also set authenticates the email with this password. NOT SECURE: Use emailPassFile instead! | not set |
\ No newline at end of file diff --git a/doc_archive/queue/Queue_CommandLineFunctions.md b/doc_archive/queue/Queue_CommandLineFunctions.md new file mode 100644 index 000000000..c1da00307 --- /dev/null +++ b/doc_archive/queue/Queue_CommandLineFunctions.md @@ -0,0 +1,133 @@ +## Queue CommandLineFunctions + +http://gatkforums.broadinstitute.org/gatk/discussion/1312/queue-commandlinefunctions + +

1. Basic QScript run rules

+ +

2. Command Line

+

Each CommandLineFunction must define the actual command line to run as follows.

+
class MyCommandLine extends CommandLineFunction {
+  def commandLine = "myScript.sh hello world"
+}
+

Constructing a Command Line Manually

+

If you're writing a one-off CommandLineFunction that is not destined for use +by other QScripts, it's often easiest to construct the command line directly +rather than through the API methods provided in the CommandLineFunction class.

+

For example:

+
def commandLine = "cat %s | grep -v \"#\" > %s".format(files, out)
+

Constructing a Command Line using API Methods

+

If you're writing a CommandLineFunction that will become part of Queue and/or +will be used by other QScripts, however, our best practice recommendation is +to construct your command line only using the methods provided in the +CommandLineFunction class: required(), optional(), conditional(), and repeat()

+

The reason for this is that these methods automatically escape the values you +give them so that they'll be interpreted literally within the shell scripts +Queue generates to run your command, and they also manage whitespace separation of command-line tokens for you. This prevents (for example) a value like MQ > 10 from being interpreted as an output redirection by the shell, and avoids issues with values containing embedded spaces. The methods also give you the ability to turn escaping and/or whitespace separation off as needed. An example:

+
override def commandLine = super.commandLine +
+                           required("eff") +
+                           conditional(verbose, "-v") +
+                           optional("-c", config) +
+                           required("-i", "vcf") +
+                           required("-o", "vcf") +
+                           required(genomeVersion) +
+                           required(inVcf) +
+                           required(">", escape=false) +  // This will be shell-interpreted as an output redirection
+                           required(outVcf)
+

The CommandLineFunctions built into Queue, including the CommandLineFunctions +automatically generated for GATK Walkers, are all written using this pattern. +This means that when you configure a GATK Walker or one of the other built-in +CommandLineFunctions in a QScript, you can rely on all of your values being +safely escaped and taken literally when the commands are run, including values +containing characters that would normally be interpreted by the shell such as +MQ > 10.

+

Below is a brief overview of the API methods available to you in the CommandLineFunction class for safely constructing command lines:

+ +

Used for command-line arguments that are always present, e.g.:

+
required("-f", "filename")                              returns: " '-f' 'filename' "
+required("-f", "filename", escape=false)                returns: " -f filename "
+required("java")                                        returns: " 'java' "
+required("INPUT=", "myBam.bam", spaceSeparated=false)   returns: " 'INPUT=myBam.bam' "
+ +

Used for command-line arguments that may or may not be present, e.g.:

+
optional("-f", myVar) behaves like required() if myVar has a value, but returns ""
+if myVar is null/Nil/None
+ +

Used for command-line arguments that should only be included if some condition is true, e.g.:

+
conditional(verbose, "-v") returns " '-v' " if verbose is true, otherwise returns ""
+ +

Used for command-line arguments that are repeated multiple times on the command line, e.g.:

+
repeat("-f", List("file1", "file2", "file3")) returns: " '-f' 'file1' '-f' 'file2' '-f' 'file3' "
+

3. Arguments

+ +

Input and Output Files

+

So that Queue can track the input and output files of a command, CommandLineFunction @Input and @Output must be java.io.File objects.

+
class MyCommandLine extends CommandLineFunction {
+  @Input(doc="input file")
+  var inputFile: File = _
+  def commandLine = "myScript.sh -fileParam " + inputFile
+}
+

FileProvider

+

CommandLineFunction variables can also provide indirect access to java.io.File inputs and outputs via the FileProvider trait.

+
class MyCommandLine extends CommandLineFunction {
+  @Input(doc="named input file")
+  var inputFile: ExampleFileProvider = _
+  def commandLine = "myScript.sh " + inputFile
+}
+
+// An example FileProvider that stores a 'name' with a 'file'.
+class ExampleFileProvider(var name: String, var file: File) extends org.broadinstitute.sting.queue.function.FileProvider {
+  override def toString = " -fileName " + name + " -fileParam " + file
+}
+

Optional Arguments

+

Optional files can be specified via required=false, and can use the CommandLineFunction.optional() utility method, as described above:

+
class MyCommandLine extends CommandLineFunction {
+  @Input(doc="input file", required=false)
+  var inputFile: File = _
+  // -fileParam will only be added if the QScript sets inputFile on this instance of MyCommandLine
+  def commandLine = required("myScript.sh") + optional("-fileParam", inputFile)
+}
+

Collections as Arguments

+

A List or Set of files can use the CommandLineFunction.repeat() utility method, as described above:

+
class MyCommandLine extends CommandLineFunction {
+  @Input(doc="input file")
+  var inputFile: List[File] = Nil // NOTE: Do not set List or Set variables to null!
+  // -fileParam will added as many times as the QScript adds the inputFile on this instance of MyCommandLine
+  def commandLine = required("myScript.sh") + repeat("-fileParam", inputFile)
+}
+

Non-File Arguments

+

A command line function can define other required arguments via @Argument.

+
class MyCommandLine extends CommandLineFunction {
+  @Argument(doc="message to display")
+  var veryImportantMessage: String = _
+  // If the QScript does not specify the required veryImportantMessage, the pipeline will not run.
+  def commandLine = required("myScript.sh") + required(veryImportantMessage)
+}
+

4. Example: "samtools index"

+
class SamToolsIndex extends CommandLineFunction {
+  @Input(doc="bam to index") var bamFile: File = _
+  @Output(doc="bam index") var baiFile: File = _
+  def commandLine = "samtools index %s %s".format(bamFile, baiFile)
+}
+

Or, using the CommandLineFunction API methods to construct the command line with automatic shell escaping:

+
class SamToolsIndex extends CommandLineFunction {
+  @Input(doc="bam to index") var bamFile: File = _
+  @Output(doc="bam index") var baiFile: File = _
+  def commandLine = required("samtools") + required("index") + required(bamFile) + required(baiFile)
+}
\ No newline at end of file diff --git a/doc_archive/queue/Queue_custom_job_schedulers.md b/doc_archive/queue/Queue_custom_job_schedulers.md new file mode 100644 index 000000000..cff65b1d4 --- /dev/null +++ b/doc_archive/queue/Queue_custom_job_schedulers.md @@ -0,0 +1,77 @@ +## Queue custom job schedulers + +http://gatkforums.broadinstitute.org/gatk/discussion/1347/queue-custom-job-schedulers + +

Implementing a Queue JobRunner

+

The following Scala methods need to be implemented for a new JobRunner. See the implementations of GridEngine and LSF for concrete full examples.

+

1. class JobRunner.start()

+

Start should copy the settings from the CommandLineFunction into your job scheduler and invoke the command via sh <jobScript>. As an example of what needs to be implemented, here are the current contents of the start() method in MyCustomJobRunner, which contains pseudo code.

+
  def start() {
+    // TODO: Copy settings from function to your job scheduler syntax.
+
+    val mySchedulerJob = new ...
+
+    // Set the display name to 4000 characters of the description (or whatever your max is)
+    mySchedulerJob.displayName = function.description.take(4000)
+
+    // Set the output file for stdout
+    mySchedulerJob.outputFile = function.jobOutputFile.getPath
+
+    // Set the current working directory
+    mySchedulerJob.workingDirectory = function.commandDirectory.getPath
+
+    // If the error file is set specify the separate output for stderr
+    if (function.jobErrorFile != null) {
+      mySchedulerJob.errFile = function.jobErrorFile.getPath
+    }
+
+    // If a project name is set specify the project name
+    if (function.jobProject != null) {
+      mySchedulerJob.projectName = function.jobProject
+    }
+
+    // If the job queue is set specify the job queue
+    if (function.jobQueue != null) {
+      mySchedulerJob.queue = function.jobQueue
+    }
+
+    // If the resident set size is requested pass on the memory request
+    if (residentRequestMB.isDefined) {
+      mySchedulerJob.jobMemoryRequest = "%dM".format(residentRequestMB.get.ceil.toInt)
+    }
+
+    // If the resident set size limit is defined specify the memory limit
+    if (residentLimitMB.isDefined) {
+      mySchedulerJob.jobMemoryLimit = "%dM".format(residentLimitMB.get.ceil.toInt)
+    }
+
+    // If the priority is set (user specified Int) specify the priority
+    if (function.jobPriority.isDefined) {
+      mySchedulerJob.jobPriority = function.jobPriority.get
+    }
+
+    // Instead of running the function.commandLine, run "sh <jobScript>"
+    mySchedulerJob.command = "sh " + jobScript
+
+    // Store the status so it can be returned in the status method.
+    myStatus = RunnerStatus.RUNNING
+
+    // Start the job and store the id so it can be killed in tryStop
+    myJobId = mySchedulerJob.start()
+  }
+

2. class JobRunner.status

+

The status method should return one of the enum values from org.broadinstitute.sting.queue.engine.RunnerStatus:

+ +
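Continuing the pseudo code from start() above (mySchedulerJob and myStatus are the hypothetical handle and field used there, not a real API), status might poll the scheduler and map its job state onto those values:

      def status = {
        // TODO: Ask your scheduler for the current state of the job started in start()
        mySchedulerJob.state match {
          case "PENDING" | "RUNNING" => myStatus = RunnerStatus.RUNNING
          case "FINISHED"            => myStatus = RunnerStatus.DONE
          case _                     => myStatus = RunnerStatus.FAILED
        }
        myStatus
      }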

3. object JobRunner.init()

+

Add any initialization code to the companion object static initializer. See the LSF or GridEngine implementations for how this is done.

+

4. object JobRunner.tryStop()

+

The jobs that are still in RunnerStatus.RUNNING will be passed into this function. tryStop() should send these jobs the equivalent of a Ctrl-C or SIGTERM(15), or worst case a SIGKILL(9) if SIGTERM is not available.

+

Running Queue with a new JobRunner

+

Once there is a basic implementation, you can try out the Hello World example with -jobRunner MyJobRunner.

+
java -Djava.io.tmpdir=tmp -jar dist/Queue.jar -S scala/qscript/examples/HelloWorld.scala -jobRunner MyJobRunner -run
+

If all goes well, Queue should dispatch the job to your job scheduler and wait until the status returns RunnerStatus.DONE, and hello world should be echoed into the output file, possibly with other log messages.

+

See [QFunction and Command Line Options]() for more info on Queue options.

\ No newline at end of file diff --git a/doc_archive/queue/Queue_pipeline_scripts_(QScripts).md b/doc_archive/queue/Queue_pipeline_scripts_(QScripts).md new file mode 100644 index 000000000..3a16eda6a --- /dev/null +++ b/doc_archive/queue/Queue_pipeline_scripts_(QScripts).md @@ -0,0 +1,335 @@ +## Queue pipeline scripts (QScripts) + +http://gatkforums.broadinstitute.org/gatk/discussion/1307/queue-pipeline-scripts-qscripts + +

1. Introduction

+

Queue pipelines are Scala 2.8 files with a bit of syntactic sugar, called QScripts. Check out the following as references.

+ +

QScripts are easiest to develop using an Integrated Development Environment. See Queue with IntelliJ IDEA for our recommended settings.

+

The following is a basic outline of a QScript:

+
import org.broadinstitute.sting.queue.QScript
+// List other imports here
+
+// Define the overall QScript here.
+class MyScript extends QScript {
+  // List script arguments here.
+  @Input(doc="My QScript inputs")
+  var scriptInput: File = _
+
+  // Create and add the functions in the script here.
+  def script = {
+     var myCL = new MyCommandLine
+     myCL.myInput = scriptInput // Example variable input
+     myCL.myOutput = new File("/path/to/output") // Example hardcoded output
+     add(myCL)
+  }
+
+}
+

2. Imports

+

Imports can be any scala or java imports in scala syntax.

+
import java.io.File
+import scala.util.Random
+import org.favorite.my._
+// etc.
+

3. Classes

+ +

4. Script method

+

The body of script should create and add Queue CommandLineFunctions.

+
class MyScript extends org.broadinstitute.sting.queue.QScript {
+  def script = add(new CommandLineFunction { def commandLine = "echo hello world" })
+}
+

5. Command Line Arguments

+ +

6. Using and writing CommandLineFunctions

+

Adding existing GATK walkers

+

See Pipelining the GATK using Queue for more information on the automatically generated Queue wrappers for GATK walkers.

+

After functions are defined they should be added to the QScript pipeline using add().

+
for (vcf <- vcfs) {
+  val ve = new VariantEval
+  ve.vcfFile = vcf
+  ve.evalFile = swapExt(vcf, "vcf", "eval")
+  add(ve)
+}
+

Defining new CommandLineFunctions

+ +

7. Examples

+ +

Hello World QScript

+

The following is a "hello world" example that runs a single command line to echo hello world.

+
import org.broadinstitute.sting.queue.QScript
+
+class HelloWorld extends QScript {
+  def script = {
+    add(new CommandLineFunction {
+      def commandLine = "echo hello world"
+    })
+  }
+}
+

The above file is checked into the Sting git repository under HelloWorld.scala. After building Queue from source, the QScript can be run with the following command:

+
java -Djava.io.tmpdir=tmp -jar dist/Queue.jar -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -run
+

It should produce output similar to:

+
INFO  16:23:27,825 QScriptManager - Compiling 1 QScript 
+INFO  16:23:31,289 QScriptManager - Compilation complete 
+INFO  16:23:34,631 HelpFormatter - --------------------------------------------------------- 
+INFO  16:23:34,631 HelpFormatter - Program Name: org.broadinstitute.sting.queue.QCommandLine 
+INFO  16:23:34,632 HelpFormatter - Program Args: -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -run  
+INFO  16:23:34,632 HelpFormatter - Date/Time: 2011/01/14 16:23:34 
+INFO  16:23:34,632 HelpFormatter - --------------------------------------------------------- 
+INFO  16:23:34,632 HelpFormatter - --------------------------------------------------------- 
+INFO  16:23:34,634 QCommandLine - Scripting HelloWorld 
+INFO  16:23:34,651 QCommandLine - Added 1 functions 
+INFO  16:23:34,651 QGraph - Generating graph. 
+INFO  16:23:34,660 QGraph - Running jobs. 
+INFO  16:23:34,689 ShellJobRunner - Starting: echo hello world 
+INFO  16:23:34,689 ShellJobRunner - Output written to /Users/kshakir/src/Sting/Q-43031@bmef8-d8e-1.out 
+INFO  16:23:34,771 ShellJobRunner - Done: echo hello world 
+INFO  16:23:34,773 QGraph - Deleting intermediate files. 
+INFO  16:23:34,773 QCommandLine - Done 
+

ExampleUnifiedGenotyper.scala

+

This example uses automatically generated Queue compatible wrappers for the GATK. See Pipelining the GATK using Queue for more info on authoring Queue support into walkers and using walkers in Queue.

+

The ExampleUnifiedGenotyper.scala for running the UnifiedGenotyper followed by VariantFiltration can be found in the examples folder.

+

To list the command line parameters, including the required parameters, run with -help.

+
java -jar dist/Queue.jar -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala -help
+

The help output should appear similar to this:

+
INFO  10:26:08,491 QScriptManager - Compiling 1 QScript
+INFO  10:26:11,926 QScriptManager - Compilation complete
+---------------------------------------------------------
+Program Name: org.broadinstitute.sting.queue.QCommandLine
+---------------------------------------------------------
+---------------------------------------------------------
+usage: java -jar Queue.jar -S <script> [-run] [-jobRunner <job_runner>] [-bsub] [-status] [-retry <retry_failed>]
+       [-startFromScratch] [-keepIntermediates] [-statusTo <status_email_to>] [-statusFrom <status_email_from>] [-dot
+       <dot_graph>] [-expandedDot <expanded_dot_graph>] [-jobPrefix <job_name_prefix>] [-jobProject <job_project>] [-jobQueue
+       <job_queue>] [-jobPriority <job_priority>] [-memLimit <default_memory_limit>] [-runDir <run_directory>] [-tempDir
+       <temp_directory>] [-jobSGDir <job_scatter_gather_directory>] [-emailHost <emailSmtpHost>] [-emailPort <emailSmtpPort>]
+       [-emailTLS] [-emailSSL] [-emailUser <emailUsername>] [-emailPassFile <emailPasswordFile>] [-emailPass <emailPassword>]
+       [-l <logging_level>] [-log <log_to_file>] [-quiet] [-debug] [-h] -R <referencefile> -I <bamfile> [-L <intervals>]
+       [-filter <filternames>] [-filterExpression <filterexpressions>]
+
+ -S,--script <script>                                                      QScript scala file
+ -run,--run_scripts                                                        Run QScripts.  Without this flag set only
+                                                                           performs a dry run.
+ -jobRunner,--job_runner <job_runner>                                      Use the specified job runner to dispatch
+                                                                           command line jobs
+ -bsub,--bsub                                                              Equivalent to -jobRunner Lsf706
+ -status,--status                                                          Get status of jobs for the qscript
+ -retry,--retry_failed <retry_failed>                                      Retry the specified number of times after a
+                                                                           command fails.  Defaults to no retries.
+ -startFromScratch,--start_from_scratch                                    Runs all command line functions even if the
+                                                                           outputs were previously output successfully.
+ -keepIntermediates,--keep_intermediate_outputs                            After a successful run keep the outputs of
+                                                                           any Function marked as intermediate.
+ -statusTo,--status_email_to <status_email_to>                             Email address to send emails to upon
+                                                                           completion or on error.
+ -statusFrom,--status_email_from <status_email_from>                       Email address to send emails from upon
+                                                                           completion or on error.
+ -dot,--dot_graph <dot_graph>                                              Outputs the queue graph to a .dot file.  See:
+                                                                           http://en.wikipedia.org/wiki/DOT_language
+ -expandedDot,--expanded_dot_graph <expanded_dot_graph>                    Outputs the queue graph of scatter gather to
+                                                                           a .dot file.  Otherwise overwrites the
+                                                                           dot_graph
+ -jobPrefix,--job_name_prefix <job_name_prefix>                            Default name prefix for compute farm jobs.
+ -jobProject,--job_project <job_project>                                   Default project for compute farm jobs.
+ -jobQueue,--job_queue <job_queue>                                         Default queue for compute farm jobs.
+ -jobPriority,--job_priority <job_priority>                                Default priority for jobs.
+ -memLimit,--default_memory_limit <default_memory_limit>                   Default memory limit for jobs, in gigabytes.
+ -runDir,--run_directory <run_directory>                                   Root directory to run functions from.
+ -tempDir,--temp_directory <temp_directory>                                Temp directory to pass to functions.
+ -jobSGDir,--job_scatter_gather_directory <job_scatter_gather_directory>   Default directory to place scatter gather
+                                                                           output for compute farm jobs.
+ -emailHost,--emailSmtpHost <emailSmtpHost>                                Email SMTP host. Defaults to localhost.
+ -emailPort,--emailSmtpPort <emailSmtpPort>                                Email SMTP port. Defaults to 465 for ssl,
+                                                                           otherwise 25.
+ -emailTLS,--emailUseTLS                                                   Email should use TLS. Defaults to false.
+ -emailSSL,--emailUseSSL                                                   Email should use SSL. Defaults to false.
+ -emailUser,--emailUsername <emailUsername>                                Email SMTP username. Defaults to none.
+ -emailPassFile,--emailPasswordFile <emailPasswordFile>                    Email SMTP password file. Defaults to none.
+ -emailPass,--emailPassword <emailPassword>                                Email SMTP password. Defaults to none. Not
+                                                                           secure! See emailPassFile.
+ -l,--logging_level <logging_level>                                        Set the minimum level of logging, i.e.
+                                                                           setting INFO get's you INFO up to FATAL,
+                                                                           setting ERROR gets you ERROR and FATAL level
+                                                                           logging.
+ -log,--log_to_file <log_to_file>                                          Set the logging location
+ -quiet,--quiet_output_mode                                                Set the logging to quiet mode, no output to
+                                                                           stdout
+ -debug,--debug_mode                                                       Set the logging file string to include a lot
+                                                                           of debugging information (SLOW!)
+ -h,--help                                                                 Generate this help message
+
+Arguments for ExampleUnifiedGenotyper:
+ -R,--referencefile <referencefile>                          The reference file for the bam files.
+ -I,--bamfile <bamfile>                                      Bam file to genotype.
+ -L,--intervals <intervals>                                  An optional file with a list of intervals to proccess.
+ -filter,--filternames <filternames>                         A optional list of filter names.
+ -filterExpression,--filterexpressions <filterexpressions>   An optional list of filter expressions.
+
+##### ERROR ------------------------------------------------------------------------------------------
+##### ERROR stack trace
+org.broadinstitute.sting.commandline.MissingArgumentException:
+Argument with name '--bamfile' (-I) is missing.
+Argument with name '--referencefile' (-R) is missing.
+        at org.broadinstitute.sting.commandline.ParsingEngine.validate(ParsingEngine.java:192)
+        at org.broadinstitute.sting.commandline.ParsingEngine.validate(ParsingEngine.java:172)
+        at org.broadinstitute.sting.commandline.CommandLineProgram.start(CommandLineProgram.java:199)
+        at org.broadinstitute.sting.queue.QCommandLine$.main(QCommandLine.scala:57)
+        at org.broadinstitute.sting.queue.QCommandLine.main(QCommandLine.scala)
+##### ERROR ------------------------------------------------------------------------------------------
+##### ERROR A GATK RUNTIME ERROR has occurred (version 1.0.5504):
+##### ERROR
+##### ERROR Please visit the wiki to see if this is a known problem
+##### ERROR If not, please post the error, with stack trace, to the GATK forum
+##### ERROR Visit our wiki for extensive documentation http://www.broadinstitute.org/gsa/wiki
+##### ERROR Visit our forum to view answers to commonly asked questions http://getsatisfaction.com/gsa
+##### ERROR
+##### ERROR MESSAGE: Argument with name '--bamfile' (-I) is missing.
+##### ERROR Argument with name '--referencefile' (-R) is missing.
+##### ERROR ------------------------------------------------------------------------------------------
+

To dry run the pipeline:

+
java \
+  -Djava.io.tmpdir=tmp \
+  -jar dist/Queue.jar \
+  -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala \
+  -R human_b36_both.fasta \
+  -I pilot2_daughters.chr20.10k-11k.bam \
+  -L chr20.interval_list \
+  -filter StrandBias -filterExpression "SB>=0.10" \
+  -filter AlleleBalance -filterExpression "AB>=0.75" \
+  -filter QualByDepth -filterExpression "QD<5" \
+  -filter HomopolymerRun -filterExpression "HRun>=4"
+

The dry run output should appear similar to this:

+
INFO  10:45:00,354 QScriptManager - Compiling 1 QScript
+INFO  10:45:04,855 QScriptManager - Compilation complete
+INFO  10:45:05,058 HelpFormatter - ---------------------------------------------------------
+INFO  10:45:05,059 HelpFormatter - Program Name: org.broadinstitute.sting.queue.QCommandLine
+INFO  10:45:05,059 HelpFormatter - Program Args: -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala -R human_b36_both.fasta -I pilot2_daughters.chr20.10k-11k.bam -L chr20.interval_list -filter StrandBias -filterExpression SB>=0.10 -filter AlleleBalance -filterExpression AB>=0.75 -filter QualByDepth -filterExpression QD<5 -filter HomopolymerRun -filterExpression HRun>=4 
+INFO  10:45:05,059 HelpFormatter - Date/Time: 2011/03/24 10:45:05
+INFO  10:45:05,059 HelpFormatter - ---------------------------------------------------------
+INFO  10:45:05,059 HelpFormatter - ---------------------------------------------------------
+INFO  10:45:05,061 QCommandLine - Scripting ExampleUnifiedGenotyper
+INFO  10:45:05,150 QCommandLine - Added 4 functions
+INFO  10:45:05,150 QGraph - Generating graph.
+INFO  10:45:05,169 QGraph - Generating scatter gather jobs.
+INFO  10:45:05,182 QGraph - Removing original jobs.
+INFO  10:45:05,183 QGraph - Adding scatter gather jobs.
+INFO  10:45:05,231 QGraph - Regenerating graph.
+INFO  10:45:05,247 QGraph - -------
+INFO  10:45:05,252 QGraph - Pending: IntervalScatterFunction /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-1/scatter.intervals /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-2/scatter.intervals /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-3/scatter.intervals
+INFO  10:45:05,253 QGraph - Log: /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/scatter/Q-60018@bmef8-d8e-1.out
+INFO  10:45:05,254 QGraph - -------
+INFO  10:45:05,279 QGraph - Pending: java -Xmx2g -Djava.io.tmpdir=/Users/kshakir/src/Sting/tmp -cp "/Users/kshakir/src/Sting/dist/Queue.jar" org.broadinstitute.sting.gatk.CommandLineGATK -T UnifiedGenotyper -I /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.bam -L /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-1/scatter.intervals -R /Users/kshakir/src/Sting/human_b36_both.fasta -o /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-1/pilot2_daughters.chr20.10k-11k.unfiltered.vcf
+INFO  10:45:05,279 QGraph - Log: /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-1/Q-60018@bmef8-d8e-1.out
+INFO  10:45:05,279 QGraph - -------
+INFO  10:45:05,283 QGraph - Pending: java -Xmx2g -Djava.io.tmpdir=/Users/kshakir/src/Sting/tmp -cp "/Users/kshakir/src/Sting/dist/Queue.jar" org.broadinstitute.sting.gatk.CommandLineGATK -T UnifiedGenotyper -I /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.bam -L /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-2/scatter.intervals -R /Users/kshakir/src/Sting/human_b36_both.fasta -o /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-2/pilot2_daughters.chr20.10k-11k.unfiltered.vcf
+INFO  10:45:05,283 QGraph - Log: /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-2/Q-60018@bmef8-d8e-1.out
+INFO  10:45:05,283 QGraph - -------
+INFO  10:45:05,287 QGraph - Pending: java -Xmx2g -Djava.io.tmpdir=/Users/kshakir/src/Sting/tmp -cp "/Users/kshakir/src/Sting/dist/Queue.jar" org.broadinstitute.sting.gatk.CommandLineGATK -T UnifiedGenotyper -I /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.bam -L /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-3/scatter.intervals -R /Users/kshakir/src/Sting/human_b36_both.fasta -o /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-3/pilot2_daughters.chr20.10k-11k.unfiltered.vcf
+INFO  10:45:05,287 QGraph - Log: /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-3/Q-60018@bmef8-d8e-1.out
+INFO  10:45:05,288 QGraph - -------
+INFO  10:45:05,288 QGraph - Pending: SimpleTextGatherFunction /Users/kshakir/src/Sting/Q-60018@bmef8-d8e-1.out
+INFO  10:45:05,288 QGraph - Log: /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/gather-jobOutputFile/Q-60018@bmef8-d8e-1.out
+INFO  10:45:05,289 QGraph - -------
+INFO  10:45:05,291 QGraph - Pending: java -Xmx1g -Djava.io.tmpdir=/Users/kshakir/src/Sting/tmp -cp "/Users/kshakir/src/Sting/dist/Queue.jar" org.broadinstitute.sting.gatk.CommandLineGATK -T CombineVariants -L /Users/kshakir/src/Sting/chr20.interval_list -R /Users/kshakir/src/Sting/human_b36_both.fasta -B:input0,VCF /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-1/pilot2_daughters.chr20.10k-11k.unfiltered.vcf -B:input1,VCF /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-2/pilot2_daughters.chr20.10k-11k.unfiltered.vcf -B:input2,VCF /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-3/pilot2_daughters.chr20.10k-11k.unfiltered.vcf -o /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.unfiltered.vcf -priority input0,input1,input2 -assumeIdenticalSamples
+INFO  10:45:05,291 QGraph - Log: /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/gather-out/Q-60018@bmef8-d8e-1.out
+INFO  10:45:05,292 QGraph - -------
+INFO  10:45:05,296 QGraph - Pending: java -Xmx2g -Djava.io.tmpdir=/Users/kshakir/src/Sting/tmp -cp "/Users/kshakir/src/Sting/dist/Queue.jar" org.broadinstitute.sting.gatk.CommandLineGATK -T VariantEval -L /Users/kshakir/src/Sting/chr20.interval_list -R /Users/kshakir/src/Sting/human_b36_both.fasta -B:eval,VCF /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.unfiltered.vcf -o /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.unfiltered.eval
+INFO  10:45:05,296 QGraph - Log: /Users/kshakir/src/Sting/Q-60018@bmef8-d8e-2.out
+INFO  10:45:05,296 QGraph - -------
+INFO  10:45:05,299 QGraph - Pending: java -Xmx2g -Djava.io.tmpdir=/Users/kshakir/src/Sting/tmp -cp "/Users/kshakir/src/Sting/dist/Queue.jar" org.broadinstitute.sting.gatk.CommandLineGATK -T VariantFiltration -L /Users/kshakir/src/Sting/chr20.interval_list -R /Users/kshakir/src/Sting/human_b36_both.fasta -B:vcf,VCF /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.unfiltered.vcf -o /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.filtered.vcf -filter SB>=0.10 -filter AB>=0.75 -filter QD<5 -filter HRun>=4 -filterName StrandBias -filterName AlleleBalance -filterName QualByDepth -filterName HomopolymerRun
+INFO  10:45:05,299 QGraph - Log: /Users/kshakir/src/Sting/Q-60018@bmef8-d8e-3.out
+INFO  10:45:05,302 QGraph - -------
+INFO  10:45:05,303 QGraph - Pending: java -Xmx2g -Djava.io.tmpdir=/Users/kshakir/src/Sting/tmp -cp "/Users/kshakir/src/Sting/dist/Queue.jar" org.broadinstitute.sting.gatk.CommandLineGATK -T VariantEval -L /Users/kshakir/src/Sting/chr20.interval_list -R /Users/kshakir/src/Sting/human_b36_both.fasta -B:eval,VCF /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.filtered.vcf -o /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.filtered.eval
+INFO  10:45:05,303 QGraph - Log: /Users/kshakir/src/Sting/Q-60018@bmef8-d8e-4.out
+INFO  10:45:05,304 QGraph - Dry run completed successfully!
+INFO  10:45:05,304 QGraph - Re-run with "-run" to execute the functions.
+INFO  10:45:05,304 QCommandLine - Done
+

8. Using traits to pass common values between QScripts to CommandLineFunctions

+

QScript files often create multiple CommandLineFunctions with similar arguments. Use various scala tricks such as inner classes, traits / mixins, etc. to reuse variables.

+ +

See the following example:

+
class MyScript extends org.broadinstitute.sting.queue.QScript {
+  // Create an alias 'qscript' for 'MyScript.this'
+  qscript =>
+
+  // This is a script argument
+  @Argument(doc="message to display")
+  var message: String = _
+
+  // This is a script argument
+  @Argument(doc="number of times to display")
+  var count: Int = _
+
+  trait ReusableArguments extends MyCommandLineFunction {
+    // Whenever a function is created 'with' this trait, it will copy the message.
+    this.commandLineMessage = qscript.message
+  }
+
+  abstract class MyCommandLineFunction extends CommandLineFunction {
+     // This is a per command line argument
+     @Argument(doc="message to display")
+     var commandLineMessage: String = _
+  }
+
+  class MyEchoFunction extends MyCommandLineFunction {
+     def commandLine = "echo " + commandLineMessage
+  }
+
+  class MyAlsoEchoFunction extends MyCommandLineFunction {
+     def commandLine = "echo also " + commandLineMessage
+  }
+
+  def script = {
+    for (i <- 1 to count) {
+      val echo = new MyEchoFunction with ReusableArguments
+      val alsoEcho = new MyAlsoEchoFunction with ReusableArguments
+      add(echo, alsoEcho)
+    }
+  }
+}
\ No newline at end of file diff --git a/doc_archive/queue/Queue_with_Grid_Engine.md b/doc_archive/queue/Queue_with_Grid_Engine.md new file mode 100644 index 000000000..abb931e16 --- /dev/null +++ b/doc_archive/queue/Queue_with_Grid_Engine.md @@ -0,0 +1,45 @@ +## Queue with Grid Engine + +http://gatkforums.broadinstitute.org/gatk/discussion/1313/queue-with-grid-engine + +

1. Background

+

Thanks to contributions from the community, Queue contains a job runner compatible with Grid Engine 6.2u5.

+

As of July 2011 this is the currently known list of forked distributions of Sun's Grid Engine 6.2u5. As long as they are JDRMAA 1.0 source compatible with Grid Engine 6.2u5, the compiled Queue code should run against each of these distributions. However we have yet to receive confirmation that Queue works on any of these setups.

+ +

Our internal QScript integration tests run the same tests on both LSF 7.0.6 and a Grid Engine 6.2u5 cluster setup on older software released by Sun.

+

If you run into trouble, please let us know. If you would like to contribute additions or bug fixes please create a fork in our github repo where we can review and pull in the patch.

+

2. Running Queue with GridEngine

+

Try out the Hello World example with -jobRunner GridEngine.

+
java -Djava.io.tmpdir=tmp -jar dist/Queue.jar -S public/scala/qscript/examples/HelloWorld.scala -jobRunner GridEngine -run
+

If all goes well, Queue should dispatch the job to Grid Engine and wait until the status returns RunningStatus.DONE, and "hello world" should be echoed into the output file, possibly along with other Grid Engine log messages.

+

See QFunction and Command Line Options for more info on Queue options.

+

3. Debugging issues with Queue and GridEngine

+

If you run into an error with Queue submitting jobs to GridEngine, first try submitting the HelloWorld example with -memLimit 2:

+
java -Djava.io.tmpdir=tmp -jar dist/Queue.jar -S public/scala/qscript/examples/HelloWorld.scala -jobRunner GridEngine -run -memLimit 2
+

Then try the following GridEngine qsub commands. They are based on what Queue submits via the API when running the HelloWorld.scala example with and without memory reservations and limits:

+
qsub -w e -V -b y -N echo_hello_world \
+  -o test.out -wd $PWD -j y echo hello world
+
+qsub -w e -V -b y -N echo_hello_world \
+  -o test.out -wd $PWD -j y \
+  -l mem_free=2048M -l h_rss=2458M echo hello world
+

One other thing to check is if there is a memory limit on your cluster. For example try submitting jobs with up to 16G.

+
qsub -w e -V -b y -N echo_hello_world \
+  -o test.out -wd $PWD -j y \
+  -l mem_free=4096M -l h_rss=4915M echo hello world
+
+qsub -w e -V -b y -N echo_hello_world \
+  -o test.out -wd $PWD -j y \
+  -l mem_free=8192M -l h_rss=9830M echo hello world
+
+qsub -w e -V -b y -N echo_hello_world \
+  -o test.out -wd $PWD -j y \
+  -l mem_free=16384M -l h_rss=19960M echo hello world
+

If the above tests pass and GridEngine will still not dispatch jobs submitted by Queue please report the issue to our support forum.

\ No newline at end of file diff --git a/doc_archive/queue/Queue_with_IntelliJ_IDEA.md b/doc_archive/queue/Queue_with_IntelliJ_IDEA.md new file mode 100644 index 000000000..dbad75591 --- /dev/null +++ b/doc_archive/queue/Queue_with_IntelliJ_IDEA.md @@ -0,0 +1,170 @@ +## Queue with IntelliJ IDEA + +http://gatkforums.broadinstitute.org/gatk/discussion/1309/queue-with-intellij-idea + +

We have found that Queue works best with IntelliJ IDEA Community Edition (free) or Ultimate Edition installed with the Scala Plugin enabled. Once you have downloaded IntelliJ IDEA, follow the instructions below to set up a Sting project with Queue and the Scala Plugin.

+

[[File:sting_project_libraries.png|300px|thumb|right|Project Libraries]] +[[File:sting_module_sources.png|300px|thumb|right|Module Sources]] +[[File:sting_module_dependencies.png|300px|thumb|right|Module Dependencies]] +[[File:sting_module_scala_facet.png|300px|thumb|right|Scala Facet]]

+

1. Build Queue on the Command Line

+

Build Queue from source from the command line with ant queue, so that:

+ +
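
A minimal sketch of that build step, assuming Apache Ant is installed and you are at the root of your Sting checkout:

    ant queue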

2. Add the scala plugin

+ +

3. Creating a new Sting Project including Queue

+ +

4. Enable annotation processing

+ +

5. Debugging Queue

+

Adding a Remote Configuration

+

[[File:queue_debug.png|300px|thumb|right|Queue Remote Debug]]

+ +

Running with the Remote Configuration

+ +

6. Binding javadocs and source

+

From Stack overflow:

+

Add javadocs:

+

Point IntelliJ to http://download.oracle.com/javase/6/docs/api/.
+Go to File -> Project Structure -> SDKs -> Apple 1.x -> DocumentationPaths, and then click specify URL.

+

Add sources:

+

In IntelliJ, open File -> Project Structure. +Click on "SDKs" under "Platform Settings". +Add the following path under the Sourcepath tab: +/Library/Java/JavaVirtualMachines/1.6.0_29-b11-402.jdk/Contents/Home/src.jar!/src

\ No newline at end of file diff --git a/doc_archive/queue/The_10+_Queuemandents.md b/doc_archive/queue/The_10+_Queuemandents.md new file mode 100644 index 000000000..c1257cbda --- /dev/null +++ b/doc_archive/queue/The_10+_Queuemandents.md @@ -0,0 +1,20 @@ +## The 10+ Queuemandents + +http://gatkforums.broadinstitute.org/gatk/discussion/8027/the-10-queuemandents + +

In no particular order:

+ \ No newline at end of file diff --git a/doc_archive/queue/Writing_unit___regression_tests_for_QScripts.md b/doc_archive/queue/Writing_unit___regression_tests_for_QScripts.md new file mode 100644 index 000000000..ffcc7ae27 --- /dev/null +++ b/doc_archive/queue/Writing_unit___regression_tests_for_QScripts.md @@ -0,0 +1,137 @@ +## Writing unit / regression tests for QScripts + +http://gatkforums.broadinstitute.org/gatk/discussion/1353/writing-unit-regression-tests-for-qscripts + +

In addition to testing walkers individually, you may want to also run integration tests for your QScript pipelines.

+

1. Brief comparison to the Walker integration tests

+ +

2. PipelineTestSpec

+

When building up a pipeline test spec, specify the following variables for your test.

| Variable | Type | Description |
| --- | --- | --- |
| args | String | The arguments to pass to the Queue test, ex: -S scala/qscript/examples/HelloWorld.scala |
| jobQueue | String | Job Queue to run the test. Default is null which means use hour. |
| fileMD5s | Map[Path, MD5] | Expected MD5 results for each file path. |
| expectedException | classOf[Exception] | Expected exception from the test. |
+

3. Example PipelineTest

+

The following example runs the ExampleCountLoci QScript on a small bam and verifies that the MD5 result is as expected.

+

It is checked into the Sting repository under scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala

+
package org.broadinstitute.sting.queue.pipeline.examples
+
+import org.testng.annotations.Test
+import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
+import org.broadinstitute.sting.BaseTest
+
+class ExampleCountLociPipelineTest {
+  @Test
+  def testCountLoci {
+    val testOut = "count.out"
+    val spec = new PipelineTestSpec
+    spec.name = "countloci"
+    spec.args = Array(
+      " -S scala/qscript/examples/ExampleCountLoci.scala",
+      " -R " + BaseTest.hg18Reference,
+      " -I " + BaseTest.validationDataLocation + "small_bam_for_countloci.bam",
+      " -o " + testOut).mkString
+    spec.fileMD5s += testOut -> "67823e4722495eb10a5e4c42c267b3a6"
+    PipelineTest.executeTest(spec)
+  }
+}
+

3. Running Pipeline Tests

+

Dry Run

+

To test whether the script at least compiles with your arguments, run ant pipelinetest, specifying the name of your class with -Dsingle:

+
ant pipelinetest -Dsingle=ExampleCountLociPipelineTest
+

Sample output:

+
   [testng] --------------------------------------------------------------------------------
+   [testng] Executing test countloci with Queue arguments: -S scala/qscript/examples/ExampleCountLoci.scala -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/small_bam_for_countloci.bam -o count.out -bsub -l WARN -tempDir pipelinetests/countloci/temp/ -runDir pipelinetests/countloci/run/ -jobQueue hour
+   [testng]   => countloci PASSED DRY RUN
+   [testng] PASSED: testCountLoci
+

Run

+

As of July 2011 the pipeline tests run against LSF 7.0.6 and Grid Engine 6.2u5. To include these two packages in your environment use the hidden dotkit .combined_LSF_SGE.

+
reuse .combined_LSF_SGE
+

Once you are satisfied that the dry run has completed without error, actually run the pipeline test with ant pipelinetestrun.

+
ant pipelinetestrun -Dsingle=ExampleCountLociPipelineTest
+

Sample output:

+
   [testng] --------------------------------------------------------------------------------
+   [testng] Executing test countloci with Queue arguments: -S scala/qscript/examples/ExampleCountLoci.scala -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/small_bam_for_countloci.bam -o count.out -bsub -l WARN -tempDir pipelinetests/countloci/temp/ -runDir pipelinetests/countloci/run/ -jobQueue hour -run
+   [testng] ##### MD5 file is up to date: integrationtests/67823e4722495eb10a5e4c42c267b3a6.integrationtest
+   [testng] Checking MD5 for pipelinetests/countloci/run/count.out [calculated=67823e4722495eb10a5e4c42c267b3a6, expected=67823e4722495eb10a5e4c42c267b3a6]
+   [testng]   => countloci PASSED
+   [testng] PASSED: testCountLoci
+

Generating initial MD5s

+

If you don't know the MD5s yet, you can run the command yourself on the command line and then MD5 the outputs yourself, or you can set the MD5s in your test to "" and run the pipeline.
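
For example, to checksum an output you generated manually (a sketch assuming a Linux environment; on macOS use md5 instead of md5sum):

    md5sum count.out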

+

When the MD5s are blank as in:

+
spec.fileMD5s += testOut -> ""
+

You run:

+
ant pipelinetest -Dsingle=ExampleCountLociPipelineTest -Dpipeline.run=run
+

And the output will look like:

+
   [testng] --------------------------------------------------------------------------------
+   [testng] Executing test countloci with Queue arguments: -S scala/qscript/examples/ExampleCountLoci.scala -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/small_bam_for_countloci.bam -o count.out -bsub -l WARN -tempDir pipelinetests/countloci/temp/ -runDir pipelinetests/countloci/run/ -jobQueue hour -run
+   [testng] ##### MD5 file is up to date: integrationtests/67823e4722495eb10a5e4c42c267b3a6.integrationtest
+   [testng] PARAMETERIZATION[countloci]: file pipelinetests/countloci/run/count.out has md5 = 67823e4722495eb10a5e4c42c267b3a6, stated expectation is , equal? = false
+   [testng]   => countloci PASSED
+   [testng] PASSED: testCountLoci
+

Checking MD5s

+

When a pipeline test fails due to an MD5 mismatch you can use the MD5 database to diff the results.

+
   [testng] --------------------------------------------------------------------------------
+   [testng] Executing test countloci with Queue arguments: -S scala/qscript/examples/ExampleCountLoci.scala -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/small_bam_for_countloci.bam -o count.out -bsub -l WARN -tempDir pipelinetests/countloci/temp/ -runDir pipelinetests/countloci/run/ -jobQueue hour -run
+   [testng] ##### Updating MD5 file: integrationtests/67823e4722495eb10a5e4c42c267b3a6.integrationtest
+   [testng] Checking MD5 for pipelinetests/countloci/run/count.out [calculated=67823e4722495eb10a5e4c42c267b3a6, expected=67823e4722495eb10a5e0000deadbeef]
+   [testng] ##### Test countloci is going fail #####
+   [testng] ##### Path to expected   file (MD5=67823e4722495eb10a5e0000deadbeef): integrationtests/67823e4722495eb10a5e0000deadbeef.integrationtest
+   [testng] ##### Path to calculated file (MD5=67823e4722495eb10a5e4c42c267b3a6): integrationtests/67823e4722495eb10a5e4c42c267b3a6.integrationtest
+   [testng] ##### Diff command: diff integrationtests/67823e4722495eb10a5e0000deadbeef.integrationtest integrationtests/67823e4722495eb10a5e4c42c267b3a6.integrationtest
+   [testng] FAILED: testCountLoci
+   [testng] java.lang.AssertionError: 1 of 1 MD5s did not match.
+

If you need to examine a number of MD5s which may have changed you can briefly shut off MD5 mismatch failures by setting parameterize = true.

+
spec.parameterize = true
+spec.fileMD5s += testOut -> "67823e4722495eb10a5e4c42c267b3a6"
+

For this run:

+
ant pipelinetest -Dsingle=ExampleCountLociPipelineTest -Dpipeline.run=run
+

If there's a match the output will resemble:

+
   [testng] --------------------------------------------------------------------------------
+   [testng] Executing test countloci with Queue arguments: -S scala/qscript/examples/ExampleCountLoci.scala -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/small_bam_for_countloci.bam -o count.out -bsub -l WARN -tempDir pipelinetests/countloci/temp/ -runDir pipelinetests/countloci/run/ -jobQueue hour -run
+   [testng] ##### MD5 file is up to date: integrationtests/67823e4722495eb10a5e4c42c267b3a6.integrationtest
+   [testng] PARAMETERIZATION[countloci]: file pipelinetests/countloci/run/count.out has md5 = 67823e4722495eb10a5e4c42c267b3a6, stated expectation is 67823e4722495eb10a5e4c42c267b3a6, equal? = true
+   [testng]   => countloci PASSED
+   [testng] PASSED: testCountLoci
+

While for a mismatch it will look like this:

+
   [testng] --------------------------------------------------------------------------------
+   [testng] Executing test countloci with Queue arguments: -S scala/qscript/examples/ExampleCountLoci.scala -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/small_bam_for_countloci.bam -o count.out -bsub -l WARN -tempDir pipelinetests/countloci/temp/ -runDir pipelinetests/countloci/run/ -jobQueue hour -run
+   [testng] ##### MD5 file is up to date: integrationtests/67823e4722495eb10a5e4c42c267b3a6.integrationtest
+   [testng] PARAMETERIZATION[countloci]: file pipelinetests/countloci/run/count.out has md5 = 67823e4722495eb10a5e4c42c267b3a6, stated expectation is 67823e4722495eb10a5e0000deadbeef, equal? = false
+   [testng]   => countloci PASSED
+   [testng] PASSED: testCountLoci
\ No newline at end of file diff --git a/doc_archive/tutorials/(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md b/doc_archive/tutorials/(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md new file mode 100644 index 000000000..74e744b67 --- /dev/null +++ b/doc_archive/tutorials/(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md @@ -0,0 +1,46 @@ +## (How to) Create a snippet of reads corresponding to a genomic interval + +http://gatkforums.broadinstitute.org/gatk/discussion/6517/how-to-create-a-snippet-of-reads-corresponding-to-a-genomic-interval + +

Tools involved

+ +

Prerequisites

+ +

Download example data

+ +

Related resources

+ +
+

Create a snippet of reads corresponding to a genomic interval using PrintReads

+

PrintReads merges or subsets sequence data. The tool automatically applies MalformedReadFilter and BadCigarFilter to filter out certain types of reads that cause problems for downstream GATK tools, e.g. reads with mismatching numbers of bases and base qualities or reads with CIGAR strings containing the N operator.

+ +

Subsetting reads corresponding to a genomic interval using PrintReads requires reads that are aligned to a reference genome, coordinate-sorted and indexed. Place the .bai index in the same directory as the .bam file.

+
java -Xmx8G -jar /path/GenomeAnalysisTK.jar \
+    -T PrintReads \ 
+    -R /path/human_g1k_v37_decoy.fasta \ #reference fasta
+    -L 10:91000000-92000000 \ #desired genomic interval chr:start-end
+    -I 6517_2Mbp_input.bam \ #input
+    -o 6517_1Mbp_output.bam 
+

This creates a subset of reads from the input file, 6517_2Mbp_input.bam, that align to the interval defined by the -L option, here a 1 Mbp region on chromosome 10. The tool creates two new files, 6517_1Mbp_output.bam and corresponding index 6517_1Mbp_output.bai.

+ +
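
As a quick sanity check on the subset, assuming samtools is installed, you can summarize the new file:

    samtools flagstat 6517_1Mbp_output.bam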

To process large files, also designate a temporary directory.

+
    TMP_DIR=/path/shlee #sets environmental variable for temporary directory
+
\ No newline at end of file diff --git a/doc_archive/tutorials/(How_to)_Fix_a_badly_formatted_BAM.md b/doc_archive/tutorials/(How_to)_Fix_a_badly_formatted_BAM.md new file mode 100644 index 000000000..0a10046b4 --- /dev/null +++ b/doc_archive/tutorials/(How_to)_Fix_a_badly_formatted_BAM.md @@ -0,0 +1,92 @@ +## (How to) Fix a badly formatted BAM + +http://gatkforums.broadinstitute.org/gatk/discussion/2909/how-to-fix-a-badly-formatted-bam + +

+

Fix a BAM that is not indexed or not sorted, has not had duplicates marked, or is lacking read group information. The options on this page are listed in order of decreasing complexity.

+

You may ask, is all of this really necessary? The GATK imposes strict formatting guidelines, including requiring certain read group information, that other software packages do not require. Although this represents a small additional processing burden upfront, the downstream benefits are numerous, including the ability to process library data individually, and significant gains in speed and parallelization options.

+

Prerequisites

+ +

Jump to a section on this page

+
1. Add read groups, coordinate sort and index using AddOrReplaceReadGroups
2. Coordinate sort and index using SortSam
3. Index an already coordinate-sorted BAM using BuildBamIndex
4. Mark duplicates using MarkDuplicates

Tools involved

+ +

Related resources

+ +

+
+

1. Add read groups, coordinate sort and index using AddOrReplaceReadGroups

+

Use Picard's AddOrReplaceReadGroups to appropriately label read group (@RG) fields, coordinate sort and index a BAM file. Only the five required @RG fields are included in the command shown. Consider the other optional @RG fields for better record keeping.

+
java -jar picard.jar AddOrReplaceReadGroups \ 
+    INPUT=reads.bam \ 
+    OUTPUT=reads_addRG.bam \ 
+    RGID=H0164.2 \ #be sure to change from default of 1
+    RGLB=library1 \ 
+    RGPL=illumina \ 
+    RGPU=H0164ALXX140820.2 \ 
+    RGSM=sample1 \ 
+

This creates a file called reads_addRG.bam with the same content and sorting as the input file, except the SAM record header's @RG line will be updated with the new information for the specified fields and each read will now have an RG tag filled with the @RG ID field information. Because of this repetition, the length of the @RG ID field contributes to file size.
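
To confirm the read group assignment, assuming samtools is installed, inspect the header:

    samtools view -H reads_addRG.bam | grep '^@RG'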

+

To additionally coordinate sort by genomic location and create a .bai index, add the following options to the command.

+
    SORT_ORDER=coordinate \ 
+    CREATE_INDEX=true
+

To process large files, also designate a temporary directory.

+
    TMP_DIR=/path/shlee #sets environmental variable for temporary directory
+

+
+

2. Coordinate sort and index using SortSam

+

Picard's SortSam sorts and indexes, and also converts between SAM and BAM formats. For coordinate sorting, reads must be aligned to a reference genome.

+
java -jar picard.jar SortSam \ 
+    INPUT=reads.bam \ 
+    OUTPUT=reads_sorted.bam \ 
+    SORT_ORDER=coordinate \
+

Concurrently index by tacking on the following option.

+
    CREATE_INDEX=true
+

This creates a file called reads_sorted.bam containing reads sorted by genomic location, aka coordinate, and a .bai index file with the same prefix as the output, e.g. reads_sorted.bai, within the same directory.
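
To confirm the sort order, assuming samtools is installed, check the @HD header line for SO:coordinate:

    samtools view -H reads_sorted.bam | grep '^@HD'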

+

To process large files, also designate a temporary directory.

+
    TMP_DIR=/path/shlee #sets environmental variable for temporary directory
+

+
+

3. Index an already coordinate-sorted BAM using BuildBamIndex

+

Picard's BuildBamIndex allows you to index a BAM that is already coordinate sorted.

+
java -jar picard.jar BuildBamIndex \ 
+    INPUT=reads_sorted.bam 
+

This creates a .bai index file with the same prefix as the input file, e.g. reads_sorted.bai, within the same directory as the input file. You want to keep this default behavior as many tools require the same prefix and directory location for the pair of files. Note that Picard tools do not systematically create an index file when they output a new BAM file, whereas GATK tools will always output indexed files.

+

To process large files, also designate a temporary directory.

+
    TMP_DIR=/path/shlee #sets environmental variable for temporary directory
+

+
+

4. Mark duplicates using MarkDuplicates

+

Picard's MarkDuplicates flags both PCR and optical duplicate reads with a 1024 (0x400) SAM flag. The input BAM must be coordinate sorted.

+
java -jar picard.jar MarkDuplicates \ 
+    INPUT=reads_sorted.bam \ 
+    OUTPUT=reads_markdup.bam \
+    METRICS_FILE=metrics.txt \
+    CREATE_INDEX=true
+

This creates a file called reads_markdup.bam with duplicate reads marked. It also creates a file called metrics.txt containing duplicate read data metrics and a .bai index file.
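
To count the reads flagged as duplicates, assuming samtools is installed, filter on the 1024 flag bit mentioned above:

    samtools view -c -f 1024 reads_markdup.bam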

+

To process large files, also designate a temporary directory.

+
    TMP_DIR=/path/shlee #sets environmental variable for temporary directory
+ +

back to top

+
\ No newline at end of file diff --git a/doc_archive/tutorials/(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md b/doc_archive/tutorials/(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md new file mode 100644 index 000000000..83070e01a --- /dev/null +++ b/doc_archive/tutorials/(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md @@ -0,0 +1,125 @@ +## (How to) Generate an unmapped BAM from FASTQ or aligned BAM + +http://gatkforums.broadinstitute.org/gatk/discussion/6484/how-to-generate-an-unmapped-bam-from-fastq-or-aligned-bam + +

+ Here we outline how to generate an unmapped BAM (uBAM) from either a FASTQ or aligned BAM file. We use Picard's FastqToSam to convert a FASTQ (Option A) or Picard's RevertSam to convert an aligned BAM (Option B).

+

Jump to a section on this page

+

(A) Convert FASTQ to uBAM and add read group information using FastqToSam
(B) Convert aligned BAM to uBAM and discard problematic records using RevertSam

+

Tools involved

+ +

Prerequisites

+ +

Download example data

+ +

Tutorial data reads were originally aligned to the advanced tutorial bundle's human_g1k_v37_decoy.fasta reference and to 10:91,000,000-92,000,000.

+

Related resources

+ +
+

+

(A) Convert FASTQ to uBAM and add read group information using FastqToSam

+

Picard's FastqToSam transforms a FASTQ file to an unmapped BAM, requires two read group fields and makes optional specification of other read group fields. In the command below we note which fields are required for GATK Best Practices Workflows. All other read group fields are optional.

+
java -Xmx8G -jar picard.jar FastqToSam \
+    FASTQ=6484_snippet_1.fastq \ #first read file of pair
+    FASTQ2=6484_snippet_2.fastq \ #second read file of pair
+    OUTPUT=6484_snippet_fastqtosam.bam \
+    READ_GROUP_NAME=H0164.2 \ #required; changed from default of A
+    SAMPLE_NAME=NA12878 \ #required
+    LIBRARY_NAME=Solexa-272222 \ #required 
+    PLATFORM_UNIT=H0164ALXX140820.2 \ 
+    PLATFORM=illumina \ #recommended
+    SEQUENCING_CENTER=BI \ 
+    RUN_DATE=2014-08-20T00:00:00-0400
+

Some details on select parameters:

+ +

Paired reads will have SAM flag values that reflect pairing and the fact that the reads are unmapped as shown in the example read pair below.

+

Original first read

+
@H0164ALXX140820:2:1101:10003:49022/1
+ACTTTAGAAATTTACTTTTAAGGACTTTTGGTTATGCTGCAGATAAGAAATATTCTTTTTTTCTCCTATGTCAGTATCCCCCATTGAAATGACAATAACCTAATTATAAATAAGAATTAGGCTTTTTTTTGAACAGTTACTAGCCTATAGA
++
+-FFFFFJJJJFFAFFJFJJFJJJFJFJFJJJ<<FJJJJFJFJFJJJJ<JAJFJJFJJJJJFJJJAJJJJJJFFJFJFJJFJJFFJJJFJJJFJJFJJFJAJJJJAJFJJJJJFFJJ<<<JFJJAFJAAJJJFFFFFJJJAJJJF<AJFFFJ
+

Original second read

+
@H0164ALXX140820:2:1101:10003:49022/2
+TGAGGATCACTAGATGGGGGAGGGAGAGAAGAGATGTGGGCTGAAGAACCATCTGTTGGGTAATATGTTTACTGTCAGTGTGATGGAATAGCTGGGACCCCAAGCGTCAGTGTTACACAACTTACATCTGTTGATCGACTGTCTATGACAG
++
+AA<FFJJJAJFJFAFJJJJFAJJJJJ7FFJJ<F-FJFJJJFJJFJJFJJF<FJJA<JF-AFJFAJFJJJJJAAAFJJJJJFJJF-FF<7FJJJJJJ-JA<<J<F7-<FJFJJ7AJAF-AFFFJA--J-F######################
+

After FastqToSam

+
H0164ALXX140820:2:1101:10003:49022      77      *       0       0       *       *       0       0       ACTTTAGAAATTTACTTTTAAGGACTTTTGGTTATGCTGCAGATAAGAAATATTCTTTTTTTCTCCTATGTCAGTATCCCCCATTGAAATGACAATAACCTAATTATAAATAAGAATTAGGCTTTTTTTTGAACAGTTACTAGCCTATAGA -FFFFFJJJJFFAFFJFJJFJJJFJFJFJJJ<<FJJJJFJFJFJJJJ<JAJFJJFJJJJJFJJJAJJJJJJFFJFJFJJFJJFFJJJFJJJFJJFJJFJAJJJJAJFJJJJJFFJJ<<<JFJJAFJAAJJJFFFFFJJJAJJJF<AJFFFJ RG:Z:H0164.2
+H0164ALXX140820:2:1101:10003:49022      141     *       0       0       *       *       0       0       TGAGGATCACTAGATGGGGGAGGGAGAGAAGAGATGTGGGCTGAAGAACCATCTGTTGGGTAATATGTTTACTGTCAGTGTGATGGAATAGCTGGGACCCCAAGCGTCAGTGTTACACAACTTACATCTGTTGATCGACTGTCTATGACAG AA<FFJJJAJFJFAFJJJJFAJJJJJ7FFJJ<F-FJFJJJFJJFJJFJJF<FJJA<JF-AFJFAJFJJJJJAAAFJJJJJFJJF-FF<7FJJJJJJ-JA<<J<F7-<FJFJJ7AJAF-AFFFJA--J-F###################### RG:Z:H0164.2
+
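
As a quick sanity check, assuming samtools is available, the record count of the uBAM should match the total number of reads in the two FASTQs (four lines per read):

    samtools view -c 6484_snippet_fastqtosam.bam
    echo $(( ($(wc -l < 6484_snippet_1.fastq) + $(wc -l < 6484_snippet_2.fastq)) / 4 ))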

back to top

+
+

+

(B) Convert aligned BAM to uBAM and discard problematic records using RevertSam

+

We use Picard's RevertSam to remove alignment information and generate an unmapped BAM (uBAM). For our tutorial file we have to call on some additional parameters that we explain below. This illustrates the need to cater the tool's parameters to each dataset. As such, it is a good idea to test the reversion process on a subset of reads before committing to reverting the entirety of a large BAM. Follow the directions in this How to to create a snippet of aligned reads corresponding to a genomic interval.

+

We use the following parameters.

+
java -Xmx8G -jar /path/picard.jar RevertSam \
+    I=6484_snippet.bam \
+    O=6484_snippet_revertsam.bam \
+    SANITIZE=true \ 
+    MAX_DISCARD_FRACTION=0.005 \ #informational; does not affect processing
+    ATTRIBUTE_TO_CLEAR=XT \
+    ATTRIBUTE_TO_CLEAR=XN \
+    ATTRIBUTE_TO_CLEAR=AS \ #Picard release of 9/2015 clears AS by default
+    ATTRIBUTE_TO_CLEAR=OC \
+    ATTRIBUTE_TO_CLEAR=OP \
+    SORT_ORDER=queryname \ #default
+    RESTORE_ORIGINAL_QUALITIES=true \ #default
+    REMOVE_DUPLICATE_INFORMATION=true \ #default
+    REMOVE_ALIGNMENT_INFORMATION=true #default
+

To process large files, also designate a temporary directory.

+
    TMP_DIR=/path/shlee #sets environmental variable for temporary directory
+

We invoke or change multiple RevertSam parameters to generate an unmapped BAM

+ +

Some comments on options kept at default:

+ +

Below we show a read pair from the tutorial data before and after RevertSam. Notice the first listed read in the pair becomes reverse-complemented after RevertSam. This restores how reads are represented when they come off the sequencer: 5' to 3' of the read being sequenced.

+

For 6484_snippet.bam, SANITIZE removes 2,202 out of 279,796 (0.787%) reads, leaving us with 277,594 reads.
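
You can verify counts like these on your own files, assuming samtools is installed, by counting records before and after reversion:

    samtools view -c 6484_snippet.bam
    samtools view -c 6484_snippet_revertsam.bam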

+

Original BAM

+
H0164ALXX140820:2:1101:10003:23460  83  10  91515318    60  151M    =   91515130    -339    CCCATCCCCTTCCCCTTCCCTTTCCCTTTCCCTTTTCTTTCCTCTTTTAAAGAGACAAGGTCTTGTTCTGTCACCCAGGCTGGAATGCAGTGGTGCAGTCATGGCTCACTGCCGCCTCAGACTTCAGGGCAAAAGCAATCTTTCCAGCTCA :<<=>@AAB@AA@AA>6@@A:>,*@A@<@??@8?9>@==8?:?@?;?:><??@>==9?>8>@:?>>=>;<==>>;>?=?>>=<==>>=>9<=>??>?>;8>?><?<=:>>>;4>=>7=6>=>>=><;=;>===?=>=>>?9>>>>??==== MC:Z:60M91S MD:Z:151    PG:Z:MarkDuplicates RG:Z:H0164.2    NM:i:0  MQ:i:0  OQ:Z:<FJFFJJJJFJJJJJF7JJJ<F--JJJFJJJJ<J<FJFF<JAJJJAJAJFFJJJFJAFJAJJAJJJJJFJJJJJFJJFJJJJFJFJJJJFFJJJJJJJFAJJJFJFJFJJJFFJJJ<J7JJJJFJ<AFAJJJJJFJJJJJAJFJJAFFFFA    UQ:i:0  AS:i:151
+
+H0164ALXX140820:2:1101:10003:23460  163 10  91515130    0   60M91S  =   91515318    339 TCTTTCCTTCCTTCCTTCCTTGCTCCCTCCCTCCCTCCTTTCCTTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTCCCCTCTCCCACCCCTCTCTCCCCCCCTCCCACCC :0;.=;8?7==?794<<;:>769=,<;0:=<0=:9===/,:-==29>;,5,98=599;<=########################################################################################### SA:Z:2,33141573,-,37S69M45S,0,1;    MC:Z:151M   MD:Z:48T4T6 PG:Z:MarkDuplicates RG:Z:H0164.2    NM:i:2  MQ:i:60 OQ:Z:<-<-FA<F<FJF<A7AFAAJ<<AA-FF-AJF-FA<AFF--A-FA7AJA-7-A<F7<<AFF###########################################################################################    UQ:i:49 AS:i:50
+

After RevertSam

+
H0164ALXX140820:2:1101:10003:23460  77  *   0   0   *   *   0   0   TGAGCTGGAAAGATTGCTTTTGCCCTGAAGTCTGAGGCGGCAGTGAGCCATGACTGCACCACTGCATTCCAGCCTGGGTGACAGAACAAGACCTTGTCTCTTTAAAAGAGGAAAGAAAAGGGAAAGGGAAAGGGAAGGGGAAGGGGATGGG AFFFFAJJFJAJJJJJFJJJJJAFA<JFJJJJ7J<JJJFFJJJFJFJFJJJAFJJJJJJJFFJJJJFJFJJJJFJJFJJJJJFJJJJJAJJAJFAJFJJJFFJAJAJJJAJ<FFJF<J<JJJJFJJJ--F<JJJ7FJJJJJFJJJJFFJF< RG:Z:H0164.2
+
+H0164ALXX140820:2:1101:10003:23460  141 *   0   0   *   *   0   0   TCTTTCCTTCCTTCCTTCCTTGCTCCCTCCCTCCCTCCTTTCCTTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTCCCCTCTCCCACCCCTCTCTCCCCCCCTCCCACCC <-<-FA<F<FJF<A7AFAAJ<<AA-FF-AJF-FA<AFF--A-FA7AJA-7-A<F7<<AFF########################################################################################### RG:Z:H0164.2
+

back to top

+
\ No newline at end of file diff --git a/doc_archive/tutorials/(How_to)_Map_and_clean_up_short_read_sequence_data_efficiently.md b/doc_archive/tutorials/(How_to)_Map_and_clean_up_short_read_sequence_data_efficiently.md new file mode 100644 index 000000000..f3a8d1bf7 --- /dev/null +++ b/doc_archive/tutorials/(How_to)_Map_and_clean_up_short_read_sequence_data_efficiently.md @@ -0,0 +1,295 @@ +## (How to) Map and clean up short read sequence data efficiently + +http://gatkforums.broadinstitute.org/gatk/discussion/6483/how-to-map-and-clean-up-short-read-sequence-data-efficiently + +

+If you are interested in emulating the methods used by the Broad Genomics Platform to pre-process your short read sequencing data, you have landed on the right page. The parsimonious operating procedures outlined in this three-step workflow maximize data quality as well as storage and processing efficiency to produce a mapped and clean BAM. This clean BAM is ready for analysis workflows that start with MarkDuplicates.

+

Since your sequencing data could be in a number of formats, the first step of this workflow refers you to specific methods to generate a compatible unmapped BAM (uBAM, Tutorial#6484) or (uBAMXT, Tutorial#6570 coming soon). Not all unmapped BAMs are equal and these methods emphasize cleaning up prior meta information while giving you the opportunity to assign proper read group fields. The second step of the workflow has you marking adapter sequences, e.g. arising from read-through of short inserts, using MarkIlluminaAdapters such that they contribute minimally to alignments and allow the aligner to map otherwise unmappable reads. The third step pipes three processes to produce the final BAM. Piping SamToFastq, BWA-MEM and MergeBamAlignment saves time and allows you to bypass storage of larger intermediate FASTQ and SAM files. In particular, MergeBamAlignment merges defined information from the aligned SAM with that of the uBAM to conserve read data, and importantly, it generates additional meta information and unifies meta data. The resulting clean BAM is coordinate sorted, indexed.

+
+

The workflow reflects a lossless operating procedure that retains original sequencing read information within the final BAM file such that data is amenable to reversion and analysis by different means. These practices make scaling up and long-term storage efficient, as one needs only keep the final BAM file.

+
+

Geraldine_VdAuwera points out that there are many different ways of correctly preprocessing HTS data for variant discovery and ours is only one approach. So keep this in mind.

+

We present this workflow using real data from a public sample. The original data file, called Solexa-272222, is large at 150 GB. The file contains 151 bp paired PCR-free reads giving 30x coverage of a human whole genome sample referred to as NA12878. The entire sample library was sequenced in a single flow cell lane and thereby assigns all the reads the same read group ID. The example commands work both on this large file and on smaller files containing a subset of the reads, collectively referred to as snippet. NA12878 has a variant in exon 5 of the CYP2C19 gene, on the portion of chromosome 10 covered by the snippet, resulting in a nonfunctional protein. Consistent with GATK's recommendation of using the most up-to-date tools, for the given example results, with the exception of BWA, we used the most current versions of tools as of their testing (September to December 2015). We provide illustrative example results, some of which were derived from processing the original large file and some of which show intermediate stages skipped by this workflow.

+
+

Download example snippet data to follow along the tutorial.

+
+

We welcome feedback. Share your suggestions in the Comments section at the bottom of this page.

+
+

Jump to a section

+
1. Generate an unmapped BAM from FASTQ, aligned BAM or BCL
2. Mark adapter sequences using MarkIlluminaAdapters
3. Align reads with BWA-MEM and merge with uBAM using MergeBamAlignment
   A. Convert BAM to FASTQ and discount adapter sequences using SamToFastq
   B. Align reads and flag secondary hits using BWA-MEM
   C. Restore altered data and apply & adjust meta information using MergeBamAlignment
   D. Pipe SamToFastq, BWA-MEM and MergeBamAlignment to generate a clean BAM
+

Tools involved

+ +

Prerequisites

+ +

Download example data

+ + +

Related resources

+ +

Other notes

+ +
+

+

1. Generate an unmapped BAM from FASTQ, aligned BAM or BCL

+

If you have raw reads data in BAM format with appropriately assigned read group fields, then you can start with step 2. Namely, besides differentiating samples, the read group ID should differentiate factors contributing to technical batch effects, i.e. flow cell lane. If not, you need to reassign read group fields. This dictionary post describes factors to consider and this post and this post provide some strategic advice on handling multiplexed data.

+ +

If your reads are mapped, or in BCL or FASTQ format, then generate an unmapped BAM according to the following instructions.

+ +
+

See if you can revert 6483_snippet.bam, containing 279,534 aligned reads, to the unmapped 6483_snippet_revertsam.bam, containing 275,546 reads.

+
+

back to top

+
+

+

2. Mark adapter sequences using MarkIlluminaAdapters

+

MarkIlluminaAdapters adds the XT tag to a read record to mark the 5' start position of the specified adapter sequence and produces a metrics file. Some of the marked adapters come from concatenated adapters that randomly arise from the primordial soup that is a PCR reaction. Others represent read-through to 3' adapter ends of reads and arise from insert sizes that are shorter than the read length. In some instances read-though can affect the majority of reads in a sample, e.g. in Nextera library samples over-titrated with transposomes, and render these reads unmappable by certain aligners. Tools such as SamToFastq use the XT tag in various ways to effectively remove adapter sequence contribution to read alignment and alignment scoring metrics. Depending on your library preparation, insert size distribution and read length, expect varying amounts of such marked reads.

+
java -Xmx8G -jar /path/picard.jar MarkIlluminaAdapters \
+I=6483_snippet_revertsam.bam \
+O=6483_snippet_markilluminaadapters.bam \
+M=6483_snippet_markilluminaadapters_metrics.txt \ #naming required
+TMP_DIR=/path/shlee #optional to process large files
+

This produces two files. (1) The metrics file, 6483_snippet_markilluminaadapters_metrics.txt bins the number of tagged adapter bases versus the number of reads. (2) The 6483_snippet_markilluminaadapters.bam file is identical to the input BAM, 6483_snippet_revertsam.bam, except reads with adapter sequences will be marked with a tag in XT:i:# format, where # denotes the 5' starting position of the adapter sequence. At least six bases are required to mark a sequence. Reads without adapter sequence remain untagged.
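
For a quick look at the binned histogram in the terminal, something like the following works (a sketch that assumes the metrics file's header lines are prefixed with #):

    grep -v '^#' 6483_snippet_markilluminaadapters_metrics.txt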

+ +

We plot the metrics data that is in GATKReport file format using RStudio, and as you can see, marked bases vary in size up to the full length of reads. +

+
+

Do you get the same number of marked reads? 6483_snippet marks 448 reads (0.16%) with XT, while the original Solexa-272222 marks 3,236,552 reads (0.39%).
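
One way to count the marked reads yourself, assuming samtools is on your PATH, is to count records carrying the XT tag:

    samtools view 6483_snippet_markilluminaadapters.bam | grep -c 'XT:i:'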

+
+

Below, we show a read pair marked with the XT tag by MarkIlluminaAdapters. The insert region sequences for the reads overlap by a length corresponding approximately to the XT tag value. For XT:i:20, the majority of the read is adapter sequence. The same read pair is shown after SamToFastq transformation, where adapter sequence base quality scores have been set to 2 (# symbol), and after MergeBamAlignment, which restores original base quality scores.

+

Unmapped uBAM (step 1)

+ +

After MarkIlluminaAdapters (step 2)

+ +

After SamToFastq (step 3)

+ +

After MergeBamAlignment (step 3)

+ +

back to top

+
+

+

3. Align reads with BWA-MEM and merge with uBAM using MergeBamAlignment

+

This step actually pipes three processes, performed by three different tools. Our tutorial example files are small enough to easily view, manipulate and store, so any difference in piped or independent processing will be negligible. For larger data, however, using Unix pipelines can add up to significant savings in processing time and storage.

+
+

Not all tools are amenable to piping, and piping the wrong tools or the wrong format can result in anomalous data.

+
+

The three tools we pipe are SamToFastq, BWA-MEM and MergeBamAlignment. By piping these we bypass storage of larger intermediate FASTQ and SAM files. We additionally save time by eliminating the need for the processor to read in and write out data for two of the processes, as piping retains data in the processor's input-output (I/O) device for the next process.

+

To make the information more digestible, we will first talk about each tool separately. At the end of the section, we provide the piped command.

+

back to top

+
+

+

3A. Convert BAM to FASTQ and discount adapter sequences using SamToFastq

+

Picard's SamToFastq takes read identifiers, read sequences, and base quality scores to write a Sanger FASTQ format file. We use additional options to effectively remove previously marked adapter sequences, in this example marked with an XT tag. By specifying CLIPPING_ATTRIBUTE=XT and CLIPPING_ACTION=2, SamToFastq changes the quality scores of bases marked by XT to two--a rather low score in the Phred scale. This effectively removes the adapter portion of sequences from contributing to downstream read alignment and alignment scoring metrics.

+

Illustration of an intermediate step unused in workflow. See piped command.

+
java -Xmx8G -jar /path/picard.jar SamToFastq \
+I=6483_snippet_markilluminaadapters.bam \
+FASTQ=6483_snippet_samtofastq_interleaved.fq \
+CLIPPING_ATTRIBUTE=XT \
+CLIPPING_ACTION=2 \
+INTERLEAVE=true \ 
+NON_PF=true \
+TMP_DIR=/path/shlee #optional to process large files         
+

This produces a FASTQ file in which all extant meta data, i.e. read group information, alignment information, flags and tags are purged. What remains are the read query names prefaced with the @ symbol, read sequences and read base quality scores.
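
To eyeball the first interleaved pair, a simple peek suffices (eight lines covers two four-line FASTQ records):

    head -n 8 6483_snippet_samtofastq_interleaved.fq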

+ +

back to top

+
+

+

3B. Align reads and flag secondary hits using BWA-MEM

+

In this workflow, alignment is the most compute intensive and will take the longest time. GATK's variant discovery workflow recommends Burrows-Wheeler Aligner's maximal exact matches (BWA-MEM) algorithm (Li 2013 reference; Li 2014 benchmarks; homepage; manual). BWA-MEM is suitable for aligning high-quality long reads ranging from 70 bp to 1 Mbp against a large reference genome such as the human genome.

+ +

The example command below aligns our example data against the GRCh37 genome. The tool automatically locates the index files within the same folder as the reference FASTA file.
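
If the BWA index files do not yet exist alongside the reference FASTA, they can be generated once beforehand; a sketch, assuming bwa is on your PATH (the bwtsw algorithm suits large genomes):

    bwa index -a bwtsw /path/human_g1k_v37_decoy.fasta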

+

Illustration of an intermediate step unused in workflow. See piped command.

+
/path/bwa mem -M -t 7 -p /path/human_g1k_v37_decoy.fasta \ 
+6483_snippet_samtofastq_interleaved.fq > 6483_snippet_bwa_mem.sam
+

This command takes the FASTQ file, 6483_snippet_samtofastq_interleaved.fq, and produces an aligned SAM format file, 6483_snippet_bwa_mem.sam, containing read alignment information, an automatically generated program group record and reads sorted in the same order as the input FASTQ file. Aligner-assigned alignment information, flag and tag values reflect each read's or split read segment's best sequence match and do not take into consideration whether pairs are mapped optimally or if a mate is unmapped. Added tags include the aligner-specific XS tag that marks secondary alignment scores in XS:i:# format. This tag is given for each read even when the score is zero and even for unmapped reads. The program group record (@PG) in the header gives the program group ID, group name, group version and recapitulates the given command. Reads are sorted by query name. For the given version of BWA, the aligned file is in SAM format even if given a BAM extension.

+
+

Does the aligned file contain read group information?

+
+

We invoke three options in the command.

+ +

In the example data, all of the 1211 unmapped reads each have an asterisk (*) in column 6 of the SAM record, where a read typically records its CIGAR string. The asterisk represents that the CIGAR string is unavailable. The several asterisked reads I examined are recorded as mapping exactly to the same location as their mapped mates but with MAPQ of zero. Additionally, the asterisked reads had varying noticeable amounts of low base qualities, e.g. strings of #s, that corresponded to original base quality calls and not those changed by SamToFastq. This accounting by BWA allows these pairs to always list together, even when the reads are coordinate-sorted, and leaves a pointer to the genomic mapping of the mate of the unmapped read. For the example read pair shown below, comparing sequences shows no apparent overlap, with the highest identity at 72% over 25 nts.
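If you would like to reproduce such counts on your own data, samtools can tally records by SAM flag. These are quick checks rather than workflow steps and assume the aligned SAM from the example command; add -S if your samtools version does not auto-detect SAM input.

samtools view -c -f 4 6483_snippet_bwa_mem.sam #counts records with the read unmapped flag (0x4)
samtools view 6483_snippet_bwa_mem.sam | awk '$6 == "*"' | wc -l #counts records with an asterisk CIGAR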

+

After MarkIlluminaAdapters (step 2)

+ +

After BWA-MEM (step 3)

+ +

After MergeBamAlignment (step 3)

+ +

back to top

+
+

+

3C. Restore altered data and apply & adjust meta information using MergeBamAlignment

+

MergeBamAlignment is a beast of a tool, so its introduction is longer. It does more than is implied by its name. Explaining these features requires I fill you in on some background.

+

Broadly, the tool merges defined information from the unmapped BAM (uBAM, step 1) with that of the aligned BAM (step 3) to conserve read data, e.g. original read information and base quality scores. The tool also generates additional meta information based on the information generated by the aligner, which may alter aligner-generated designations, e.g. mate information and secondary alignment flags. The tool then makes adjustments so that all meta information is congruent, e.g. read and mate strand information based on proper mate designations. We ascribe the resulting BAM as clean.

+

Specifically, the aligned BAM generated in step 3 lacks read group information and certain tags--the UQ (Phred likelihood of the segment), MC (CIGAR string for mate) and MQ (mapping quality of mate) tags. It has hard-clipped sequences from split reads and altered base qualities. The reads also have what some call mapping artifacts but what are really just features we should not expect from our aligner. For example, the meta information so far does not consider whether pairs are optimally mapped and whether a mate is unmapped (in reality or for accounting purposes). Depending on these assignments, MergeBamAlignment adjusts the read and read mate strand orientations for reads in a proper pair. Finally, the alignment records are sorted by query name. We would like to fix all of these issues before taking our data to a variant discovery workflow.

+

Enter MergeBamAlignment. As the tool name implies, MergeBamAlignment applies read group information from the uBAM and retains the program group information from the aligned BAM. In restoring original sequences, the tool adjusts CIGAR strings from hard-clipped to soft-clipped. If the alignment file is missing reads present in the unaligned file, then these are retained as unmapped records. Additionally, MergeBamAlignment evaluates primary alignment designations according to a user-specified strategy, e.g. for optimal mate pair mapping, and changes secondary alignment and mate unmapped flags based on its calculations. It makes further adjustments for desired congruency. I will soon explain these and other changes in more detail and show a read record to illustrate.

+
+

Consider what PRIMARY_ALIGNMENT_STRATEGY option best suits your samples. MergeBamAlignment applies this strategy to a read for which the aligner has provided more than one primary alignment, and for which one is designated primary by virtue of another record being marked secondary. MergeBamAlignment considers and switches only existing primary and secondary designations. Therefore, it is critical that these were previously flagged.

+
+

A read with multiple alignment records may map to multiple loci or may be chimeric--that is, its alignment is split across records. It is possible for an aligner to produce multiple alignments as well as multiple primary alignments, e.g. in the case of a linear alignment set of split reads. When one alignment, or alignment set in the case of chimeric read records, is designated primary, others are designated either secondary or supplementary. Invoking the -M option, we had BWA mark the record with the longest aligning section of split reads as primary and all other records as secondary. MergeBamAlignment further adjusts this secondary designation and adds the read mapped in proper pair (0x2) and mate unmapped (0x8) flags. The tool then adjusts the strand orientation flag for a read (0x10) and its proper mate (0x20).
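If the hexadecimal flag values are unfamiliar, recent versions of samtools can translate them into their named bits. This lookup is purely illustrative and not a workflow step.

samtools flags 0x2  #PROPER_PAIR
samtools flags 0x8  #MUNMAP (mate unmapped)
samtools flags 0x10 #REVERSE (read reverse strand)
samtools flags 0x20 #MREVERSE (mate reverse strand)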

+

In the command, we change CLIP_ADAPTERS, MAX_INSERTIONS_OR_DELETIONS and PRIMARY_ALIGNMENT_STRATEGY values from default, and invoke other optional parameters. The path to the reference FASTA given by R should also contain the corresponding .dict sequence dictionary with the same prefix as the reference FASTA. It is imperative that both the uBAM and aligned BAM are sorted by queryname.

+

Illustration of an intermediate step unused in workflow. See piped command.

+
java -Xmx16G -jar /path/picard.jar MergeBamAlignment \
+R=/path/Homo_sapiens_assembly19.fasta \ 
+UNMAPPED_BAM=6483_snippet_revertsam.bam \
+ALIGNED_BAM=6483_snippet_bwa_mem.sam \ #accepts either SAM or BAM
+O=6483_snippet_mergebamalignment.bam \
+CREATE_INDEX=true \ #standard Picard option for coordinate-sorted outputs
+ADD_MATE_CIGAR=true \ #default; adds MC tag
+CLIP_ADAPTERS=false \ #changed from default
+CLIP_OVERLAPPING_READS=true \ #default; soft-clips ends so mates do not extend past each other
+INCLUDE_SECONDARY_ALIGNMENTS=true \ #default
+MAX_INSERTIONS_OR_DELETIONS=-1 \ #changed to allow any number of insertions or deletions
+PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ #changed from default BestMapq
+ATTRIBUTES_TO_RETAIN=XS \ #specify multiple times to retain tags starting with X, Y, or Z 
+TMP_DIR=/path/shlee #optional to process large files
+

This generates a coordinate-sorted and clean BAM, 6483_snippet_mergebamalignment.bam, and corresponding .bai index. These are ready for analyses starting with MarkDuplicates. The two bullet-point lists below describe changes to the resulting file. The first list gives general comments on select parameters and the second describes some of the notable changes to our example data.

+

Comments on select parameters

+ +

Description of changes to our example data

+ +

The example below shows a read pair for which MergeBamAlignment adjusts multiple information fields, and these changes are described in the remaining bullet points.

+ +
+

Two distinct classes of mate unmapped read records are now present in our example file: (1) reads whose mates truly failed to map and are marked by an asterisk * in column 6 of the SAM record and (2) multimapping reads whose mates are in fact mapped but in a proper pair that excludes the particular read record. Each of these two classes of mate unmapped reads can contain multimapping reads that map to two or more locations.

+
+

Comparing 6483_snippet_bwa_mem.sam and 6483_snippet_mergebamalignment.bam, we see the number of unmapped reads remains the same at 1211, while the number of records with the mate unmapped flag increases by 1359, from 1276 to 2635. These now account for 0.951% of the 276,970 read records.
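These counts can be reproduced with samtools flag filters. The commands below are quick checks using the file names from the example commands; add -S for SAM input if your samtools version requires it.

samtools view -c -f 8 6483_snippet_bwa_mem.sam #mate unmapped records before MergeBamAlignment
samtools view -c -f 8 6483_snippet_mergebamalignment.bam #mate unmapped records after MergeBamAlignment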

+
+

For 6483_snippet_mergebamalignment.bam, how many additional unique reads become mate unmapped?

+
+

After BWA-MEM alignment

+ +

After MergeBamAlignment

+ +

back to top

+
+

+

3D. Pipe SamToFastq, BWA-MEM and MergeBamAlignment to generate a clean BAM

+

We pipe the three tools described above to generate an aligned BAM file sorted by query name. In the piped command, the commands for the three processes are given together, separated by a vertical bar |. We also replace each intermediate output and input file name with a symbolic path to the system's output and input devices, here /dev/stdout and /dev/stdin, respectively. We need only provide the first input file and name the last output file.

+

Before using a piped command, we should ask UNIX to stop the piped command if any step of the pipe should error and also return to us the error messages. Type the following into your shell to set these UNIX options.

+
set -o pipefail
+

Overview of command structure

+
[SamToFastq] | [BWA-MEM] | [MergeBamAlignment]
+

Piped command

+
java -Xmx8G -jar /path/picard.jar SamToFastq \
+I=6483_snippet_markilluminaadapters.bam \
+FASTQ=/dev/stdout \
+CLIPPING_ATTRIBUTE=XT CLIPPING_ACTION=2 INTERLEAVE=true NON_PF=true \
+TMP_DIR=/path/shlee | \ 
+/path/bwa mem -M -t 7 -p /path/Homo_sapiens_assembly19.fasta /dev/stdin | \  
+java -Xmx16G -jar /path/picard.jar MergeBamAlignment \
+ALIGNED_BAM=/dev/stdin \
+UNMAPPED_BAM=6483_snippet_revertsam.bam \
+OUTPUT=6483_snippet_piped.bam \
+R=/path/Homo_sapiens_assembly19.fasta CREATE_INDEX=true ADD_MATE_CIGAR=true \
+CLIP_ADAPTERS=false CLIP_OVERLAPPING_READS=true \
+INCLUDE_SECONDARY_ALIGNMENTS=true MAX_INSERTIONS_OR_DELETIONS=-1 \
+PRIMARY_ALIGNMENT_STRATEGY=MostDistant ATTRIBUTES_TO_RETAIN=XS \
+TMP_DIR=/path/shlee
+

The piped output file, 6483_snippet_piped.bam, is for all intents and purposes the same as 6483_snippet_mergebamalignment.bam, produced by running MergeBamAlignment separately without piping. However, the resulting files, as well as new runs of the workflow on the same data, have the potential to differ in small ways because each uses a different alignment instance.

+
+

How do these small differences arise?

+
+

Counting the number of mate unmapped reads shows that this number remains unchanged for the two described workflows. Two counts emitted at the end of the process updates, which also remain constant for these instances, are the number of alignment records and the number of unmapped reads.

+
INFO    2015-12-08 17:25:59 AbstractAlignmentMerger Wrote 275759 alignment records and 1211 unmapped reads.
+

back to top

+
+

Some final remarks

+

We have produced a clean BAM that is coordinate-sorted and indexed, in an efficient manner that minimizes processing time and storage needs. The file is ready for marking duplicates as outlined in Tutorial#2799. Additionally, we can now free up storage on our file system by deleting the original file we started with, the uBAM and the uBAMXT. We sleep well at night knowing that the clean BAM retains all original information.

+

We have two final comments (1) on multiplexed samples and (2) on fitting this workflow into a larger workflow.

+

For multiplexed samples, first perform the workflow steps on a file representing one sample and one lane. Then mark duplicates. Later, after some steps in the GATK's variant discovery workflow, and after aggregating files from the same sample from across lanes into a single file, mark duplicates again. These two marking steps ensure you find both optical and PCR duplicates.

+

For workflows that nest this pipeline, consider additionally optimizing the Java parameters for the SamToFastq and MergeBamAlignment commands. For example, the following are the additional settings used by the Broad Genomics Platform in the piped command for very large data sets.

+
    java -Dsamjdk.buffer_size=131072 -Dsamjdk.compression_level=1 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx128m -jar /path/picard.jar SamToFastq ...
+
+    java -Dsamjdk.buffer_size=131072 -Dsamjdk.use_async_io=true -Dsamjdk.compression_level=1 -XX:+UseStringCache -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx5000m -jar /path/picard.jar MergeBamAlignment ...
+

I give my sincere thanks to Julian Hess, the GATK team and the Data Sciences and Data Engineering (DSDE) team members for all their help in writing this and related documents.

+

back to top

+
+

\ No newline at end of file diff --git a/doc_archive/tutorials/(How_to)_Map_reads_to_a_reference_with_alternate_contigs_like_GRCh38.md b/doc_archive/tutorials/(How_to)_Map_reads_to_a_reference_with_alternate_contigs_like_GRCh38.md new file mode 100644 index 000000000..771cc88e3 --- /dev/null +++ b/doc_archive/tutorials/(How_to)_Map_reads_to_a_reference_with_alternate_contigs_like_GRCh38.md @@ -0,0 +1,273 @@ +## (How to) Map reads to a reference with alternate contigs like GRCh38 + +http://gatkforums.broadinstitute.org/gatk/discussion/8017/how-to-map-reads-to-a-reference-with-alternate-contigs-like-grch38 + +

Document is in BETA. It may be incomplete and/or inaccurate. Post suggestions to the Comments section and be sure to read about updates also within the Comments section.

+
+

This exploratory tutorial provides instructions and example data to map short reads to a reference genome with alternate haplotypes. Instructions are suitable for indexing and mapping reads to GRCh38.

+

► If you are unfamiliar with terms that describe reference genome components, or GRCh38 alternate haplotypes, take a few minutes to study the Dictionary entry Reference Genome Components.

+

► For an introduction to GRCh38, see Blog#8180.

+

Specifically, the tutorial uses BWA-MEM to index and map simulated reads for three samples to a mini-reference composed of a GRCh38 chromosome and alternate contig (sections 1–3). We align in an alternate contig aware (alt-aware) manner, which we also call alt-handling. This is the main focus of the tutorial.

+

The decision to align to a genome with alternate haplotypes has implications for variant calling. We discuss these in section 5 using the callset generated with the optional tutorial steps outlined in section 4. Because we strategically placed a number of SNPs on the sequence used to simulate the reads, in both homologous and divergent regions, we can use the variant calls and their annotations to examine the implications of analysis approaches. To this end, the tutorial fast-forwards through pre-processing and calls variants for a trio of samples that represents the combinations of the two reference haplotypes (the PA and the ALT). This first workflow (tutorial_8017) is suitable for calling variants on the primary assembly but is insufficient for capturing variants on the alternate contigs.

+

For those who are interested in calling variants on the alternate contigs, we also present a second and a third workflow in section 6. The second workflow (tutorial_8017_toSE) takes the processed BAM from the first workflow, makes some adjustments to the reads to maximize their information, and calls variants on the alternate contig. This approach is suitable for calling on ~75% of the non-HLA alternate contigs or ~92% of loci with non-HLA alternate contigs (see table in section 6). The third workflow (tutorial_8017_postalt) takes the alt-aware alignments from the first workflow and performs a postalt-processing step as well as the same adjustment from the second workflow. Postalt-processing uses the bwa-postalt.js javascript program that Heng Li provides as a companion to BWA. This allows for variant calling on all alternate contigs including HLA alternate contigs.

+

The tutorial ends by comparing the difference in call qualities from the multiple workflows for the given example data and discusses a few caveats of each approach.

+

+

► The three workflows shown in the diagram above are available as WDL scripts in our GATK Tutorials WDL scripts repository.

+
+

Jump to a section

+
  1. Index the reference FASTA for use with BWA-MEM
  2. Include the reference ALT index file
     ☞ What happens if I forget the ALT index file?
  3. Align reads with BWA-MEM
     ☞ How can I tell if a BAM was aligned with alt-handling?
     ☞ What is the pa tag?
  4. (Optional) Add read group information, preprocess to make a clean BAM and call variants
  5. How can I tell whether I should consider an alternate haplotype for a given sample?
     (5.1) Discussion of variant calls for tutorial_8017
  6. My locus includes an alternate haplotype. How can I call variants on alt contigs?
     (6.1) Variant calls for tutorial_8017_toSE
     (6.2) Variant calls for tutorial_8017_postalt
  7. Related resources

Tools involved

+ +

Download example data

+

Download tutorial_8017.tar.gz, either from the GoogleDrive or from the ftp site. To access the ftp site, leave the password field blank. The data tarball contains the paired FASTQ reads files for three samples. It also contains a mini-reference chr19_chr19_KI270866v1_alt.fasta and corresponding .dict dictionary, .fai index and six BWA indices including the .alt index. The data tarball includes the output files from the workflow that we care most about. These are the aligned SAMs, processed and indexed BAMs and the final multisample VCF callsets from the three presented workflows.

+

The mini-reference contains two contigs subset from human GRCh38: chr19 and chr19_KI270866v1_alt. The ALT contig corresponds to a diverged haplotype of chromosome 19. Specifically, it corresponds to chr19:34350807-34392977, which contains the glucose-6-phosphate isomerase or GPI gene. Part of the ALT contig introduces novel sequence that lacks a corresponding region in the primary assembly.

+

Using instructions in Tutorial#7859, we simulated paired 2x151 reads to derive three different sample reads that when aligned give roughly 35x coverage for the target primary locus. We derived the sequences from either the 43 kbp ALT contig (sample ALTALT), the corresponding 42 kbp region of the primary assembly (sample PAPA) or both (sample PAALT). Before simulating the reads, we introduced four SNPs to each contig sequence in a deliberate manner so that we can call variants.

+

► Alternatively, you may instead use the example input files and commands with the full GRCh38 reference. Results will be similar with a handful of reads mapping outside of the mini-reference regions. +

+
+

1. Index the reference FASTA for use with BWA-MEM

+

Our example chr19_chr19_KI270866v1_alt.fasta reference already has chr19_chr19_KI270866v1_alt.dict dictionary and chr19_chr19_KI270866v1_alt.fasta.fai index files for use with Picard and GATK tools. BWA requires a different set of index files for alignment. The command below creates five of the six index files we need for alignment. The command calls the index function of BWA on the reference FASTA.

+
bwa index chr19_chr19_KI270866v1_alt.fasta
+

This gives .pac, .bwt, .ann, .amb and .sa index files that all have the same chr19_chr19_KI270866v1_alt.fasta basename. Tools recognize index files within the same directory by their identical basename. In the case of BWA, it uses the basename preceding the .fasta suffix and searches for the index file, e.g. with .bwt suffix or .64.bwt suffix. Depending on which of the two choices it finds, it looks for the same suffix for the other index files, e.g. .alt or .64.alt. Lack of a matching .alt index file will cause BWA to map reads without alt-handling. More on this next.
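After indexing, listing files that share the reference basename is a simple way to confirm the pieces are in place. For the tutorial's mini-reference, a listing along these lines should show the files below, with the .alt index added in section 2.

ls chr19_chr19_KI270866v1_alt*
#expect the .dict, .fasta and .fasta.fai files plus the BWA indices .amb, .ann, .bwt, .pac and .sa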

+

Note that the .64. part is an explicit indication that index files were generated with version 0.6 or later of BWA and are the 64-bit indices (as opposed to files generated by earlier versions, which were 32-bit). This .64. signifier can be added automatically by adding -6 to the bwa index command.

+

+back to top

+
+

2. Include the reference ALT index file

+

Be sure to place the tutorial's mini-ALT index file chr19_chr19_KI270866v1_alt.fasta.alt with the other index files. Also, if it does not already match, change the file basename to match. This is the sixth index file we need for alignment. BWA-MEM uses this file to prioritize primary assembly alignments for reads that can map to both the primary assembly and an alternate contig. See BWA documentation for details.

+ +

For the tutorial, we subset from hs38DH.fa.alt to create a mini-ALT index, chr19_chr19_KI270866v1_alt.fasta.alt. Its contents are shown below.

+ +

+

The record aligns the chr19_KI270866v1_alt contig to the chr19 locus starting at position 34,350,807 and uses CIGAR string nomenclature to indicate the pairwise structure. To interpret the CIGAR string, think of the primary assembly as the reference and the ALT contig sequence as the read. For example, the 11307M at the start indicates 11,307 corresponding sequence bases, either matches or mismatches. The 935S at the end indicates a 935 base softclip for the ALT contig sequence that lacks corresponding sequence in the primary assembly. This is a region that we consider highly divergent or novel. Finally, notice the NM tag that notes the edit distance to the reference.

+

☞ What happens if I forget the ALT index file?

+

If you omit the ALT index file from the reference, or if its naming structure mismatches the other indexes, then your alignments will be equivalent to the results you would obtain if you run BWA-MEM with the -j option. The next section gives an example of what this looks like.

+

+back to top

+
+

3. Align reads with BWA-MEM

+

The command below uses an alt-aware version of BWA and maps reads using BWA's maximal exact match (MEM) option. Because the ALT index file is present, the tool prioritizes mapping to the primary assembly over ALT contigs. In the command, the tutorial's chr19_chr19_KI270866v1_alt.fasta serves as reference; one FASTQ holds the forward reads and the other holds the reverse reads.

+
bwa mem chr19_chr19_KI270866v1_alt.fasta 8017_read1.fq 8017_read2.fq > 8017_bwamem.sam
+

The resulting file 8017_bwamem.sam contains aligned read records.

+

+ +

+

☞ How can I tell if a BAM was aligned with alt-handling?

+

There are two approaches to this question.

+

First, you can view the alignments on IGV and compare primary assembly loci with their alternate contigs. The IGV screenshots to the right show how BWA maps reads with (top) or without (bottom) alt-handling.

+

Second, you can check the alignment SAM. Of two tags that indicate alt-aware alignment, one will persist after preprocessing only if the sample has reads that can map to alternate contigs. The first tag, the AH tag, is in the BAM header section of the alignment file, and is absent after any merging step, e.g. merging with MergeBamAlignment. The second tag, the pa tag, is present for reads that the aligner alt-handles. If a sample does not contain any reads that map equally or preferentially to alternate contigs, then this tag may be absent in a BAM even if the alignments were mapped in an alt-aware manner.
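A minimal check for the header tag, assuming the aligned SAM from section 3, is to grep the @SQ lines. If nothing prints, either the alignment was not alt-aware or a later merging step dropped the tag.

grep '^@SQ' 8017_bwamem.sam | grep 'AH:' #lists alternate contigs flagged for alt-handling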

+

Here are three headers for comparison where only one indicates alt-aware alignment.

+

File header for alt-aware alignment. We use this type of alignment in the tutorial. +Each alternate contig's @SQ line in the header will have an AH:* tag to indicate alternate contig handling for that contig. This marking is based on the alternate contig being listed in the .alt index file and alt-aware alignment.

+ +

File header for -j alignment (alt-handling disabled) for example purposes. We do not perform this type of alignment in the tutorial. +Notice the absence of any special tags in the header.

+ +

+

File header for alt-aware alignment after merging with MergeBamAlignment. We use this step in the next section. +Again, notice the absence of any special tags in the header.

+ +

☞ What is the pa tag?

+

For BWA v0.7.15, but not v0.7.13, ALT loci alignment records that can align to both the primary assembly and alternate contig(s) will have a pa tag on the primary assembly alignment. For example, read chr19_KI270866v1_alt_4hetvars_26518_27047_0:0:0_0:0:0_931 of the ALTALT sample has five alignment records only three of which have the pa tag as shown below.

+ +

A brief description of each of the five alignments, in order:

+
  1. First in pair, primary alignment on the primary assembly; AS=146, pa=0.967
  2. First in pair, supplementary alignment on the alternate contig; AS=151
  3. Second in pair, primary alignment on the primary assembly; AS=120; pa=0.795
  4. Second in pair, supplementary alignment on the primary assembly; AS=54; pa=0.358
  5. Second in pair, supplementary alignment on the alternate contig; AS=151

The pa tag measures how much better a read aligns to its best alternate contig alignment versus its primary assembly (pa) alignment. Specifically, it is the ratio of the primary assembly alignment score over the highest alternate contig alignment score. In our example we have primary assembly alignment scores of 146, 120 and 54 and alternate contig alignment scores of 151 and again 151. This gives us three different pa scores that tag the primary assembly alignments: 146/151=0.967, 120/151=0.795 and 54/151=0.358.

+

In our tutorial's workflow, MergeBamAlignment may either change an alignment's pa score or add a previously unassigned pa score to an alignment. The result of this is summarized as follows for the same alignments.

+
  1. pa=0.967 --MergeBamAlignment--> same
  2. none --MergeBamAlignment--> assigns pa=0.967
  3. pa=0.795 --MergeBamAlignment--> same
  4. pa=0.358 --MergeBamAlignment--> replaces with pa=0.795
  5. none --MergeBamAlignment--> assigns pa=0.795

If you want to retain the BWA-assigned pa scores, then add the following options to the workflow commands in section 4.

+ +

In our sample set, after BWA-MEM alignment ALTALT has 1412 pa-tagged alignment records, PAALT has 805 pa-tagged alignment records and PAPA has zero pa-tagged records.
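One way to reproduce such per-sample counts, sketched here rather than taken from the tutorial workflow, is to scan the optional tag fields of each alignment record for the pa tag.

samtools view -S altalt_bwamem.sam | \
gawk '{for (i=12; i<=NF; i++) if ($i ~ /^pa:/) {n++; break}} END {print n}' #counts pa-tagged alignment records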

+

+back to top

+
+

4. Add read group information, preprocess to make a clean BAM and call variants

+

The initial alignment file is missing read group information. One way to add that information, which we use in production, is to use MergeBamAlignment. MergeBamAlignment adds back read group information contained in an unaligned BAM and adjusts meta information to produce a clean BAM ready for pre-processing (see Tutorial#6483 for details on our use of MergeBamAlignment). Given the focus here is to showcase BWA-MEM's alt-handling, we refrain from going into the details of all this additional processing. They follow, with some variation, the PairedEndSingleSampleWf pipeline detailed here.

+

Remember these are simulated reads with simulated base qualities. We simulated the reads in a manner that only introduces the planned mismatches, without any errors. Coverage is good at roughly 35x. All of the base qualities for all of the reads are at I, which is, according to this page and this site, an excellent base quality score equivalent to a Sanger Phred+33 score of 40. We can therefore skip base quality score recalibration (BQSR) since the reads are simulated and the dataset is not large enough for recalibration anyway.

+

Here are the commands to obtain a final multisample variant callset. The commands are given for one of the samples. Process each of the three samples independently in the same manner [4.1–4.6] until the last GenotypeGVCFs command [4.7].

+

[4.1] Create unmapped uBAM

+
java -jar picard.jar RevertSam \
+    I=altalt_bwamem.sam O=altalt_u.bam \
+    ATTRIBUTE_TO_CLEAR=XS ATTRIBUTE_TO_CLEAR=XA
+

[4.2] Add read group information to uBAM

+
java -jar picard.jar AddOrReplaceReadGroups \
+    I=altalt_u.bam O=altalt_rg.bam \
+    RGID=altalt RGSM=altalt RGLB=wgsim RGPU=shlee RGPL=illumina
+

[4.3] Merge uBAM with aligned BAM

+
java -jar picard.jar MergeBamAlignment \
+    ALIGNED=altalt_bwamem.sam UNMAPPED=altalt_rg.bam O=altalt_m.bam \
+    R=chr19_chr19_KI270866v1_alt.fasta \
+    SORT_ORDER=unsorted CLIP_ADAPTERS=false \
+    ADD_MATE_CIGAR=true MAX_INSERTIONS_OR_DELETIONS=-1 \
+    PRIMARY_ALIGNMENT_STRATEGY=MostDistant \
+    UNMAP_CONTAMINANT_READS=false \
+    ATTRIBUTES_TO_RETAIN=XS ATTRIBUTES_TO_RETAIN=XA
+

[4.4] Flag duplicate reads

+
java -jar picard.jar MarkDuplicates \
+    INPUT=altalt_m.bam OUTPUT=altalt_md.bam METRICS_FILE=altalt_md.bam.txt \
+    OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 ASSUME_SORT_ORDER=queryname 
+

[4.5] Coordinate sort, fix NM and UQ tags and index for clean BAM +As of Picard v2.7.0, released October 17, 2016, SetNmAndUqTags is no longer available. Use SetNmMdAndUqTags instead.

+
set -o pipefail
+java -jar picard.jar SortSam \
+    INPUT=altalt_md.bam OUTPUT=/dev/stdout SORT_ORDER=coordinate | \
+    java -jar $PICARD SetNmAndUqTags \
+    INPUT=/dev/stdin OUTPUT=altalt_snaut.bam \
+    CREATE_INDEX=true R=chr19_chr19_KI270866v1_alt.fasta
+

[4.6] Call SNP and indel variants in emit reference confidence (ERC) mode per sample using HaplotypeCaller

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
+    -R chr19_chr19_KI270866v1_alt.fasta \
+    -o altalt.g.vcf -I altalt_snaut.bam \
+    -ERC GVCF --max_alternate_alleles 3 --read_filter OverclippedRead \
+    --emitDroppedReads -bamout altalt_hc.bam
+

[4.7] Call genotypes on three samples

+
java -jar GenomeAnalysisTK.jar -T GenotypeGVCFs \
+    -R chr19_chr19_KI270866v1_alt.fasta -o multisample.vcf \
+    --variant altalt.g.vcf --variant altpa.g.vcf --variant papa.g.vcf 
+

The altalt_snaut.bam, HaplotypeCaller's altalt_hc.bam and the multisample multisample.vcf are ready for viewing on IGV.

+

Before getting into the results in the next section, we have minor comments on two filtering options.

+

In our tutorial workflows, we turn off MergeBamAlignment's UNMAP_CONTAMINANT_READS option. If set to true, 68 reads become unmapped for PAPA and 40 reads become unmapped for PAALT. These unmapped reads are those reads caught by the UNMAP_CONTAMINANT_READS filter and their mates. MergeBamAlignment defines contaminant reads as those alignments that are overclipped, i.e. that are softclipped on both ends, and that align with less than 32 bases. Changing the MIN_UNCLIPPED_BASES option from the default of 32 to 22 and 23 restores all of these reads for PAPA and PAALT, respectively. Contaminants are obviously absent for these simulated reads. And so we set UNMAP_CONTAMINANT_READS to false to disable this filtering.

+

HaplotypeCaller's --read_filter OverclippedRead option similarly looks for both-end-softclipped alignments, then filters reads aligning with less than 30 bases. The difference is that HaplotypeCaller only excludes the overclipped alignments from its calling and does not remove mapping information nor does it act on the mate of the filtered alignment. Thus, we keep this read filter for the first workflow. However, for the second and third workflows in section 6, tutorial_8017_toSE and tutorial_8017_postalt, we omit the --read_filter OverclippedRead option from the HaplotypeCaller command. We also omit the --max_alternate_alleles 3 option for simplicity.

+

+back to top

+
+

5. How can I tell whether I should consider an alternate haplotype?

+

We consider this question only for our GPI locus, a locus we know has an alternate contig in the reference. Here we use the term locus in its biological sense to refer to a contiguous genomic region of interest. The three samples give the alignment and coverage profiles shown on the right.

+

What is immediately apparent from the IGV screenshot is that the scenarios that include the alternate haplotype give a distinct pattern of variant sites to the primary assembly much like a fingerprint. These variants are predominantly heterozygous or homozygous. Looking closely at the 3' region of the locus, we see some alignment coverage anomalies that also show a distinct pattern. The coverage in some of the highly diverged region in the primary assembly drops while in others it increases. If we look at the origin of simulated reads in one of the excess coverage regions, we see that they are from two different regions of the alternate contig that suggests duplicated sequence segments within the alternate locus.

+

The variation pattern and coverage anomalies on the primary locus suggest an alternate haplotype may be present for the locus. We can then confirm the presence of aligned reads, both supplementary and primary, on the alternate locus. Furthermore, if we count the alignment records for each region, e.g. using samtools idxstats, we see the following metrics.

+
                        ALT/ALT     PA/ALT     PA/PA   
+chr19                     10005      10006     10000     
+chr19_KI270866v1_alt       1407        799         0      
+
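For a single sample, a command along the following lines, run on the coordinate-sorted and indexed BAM from section 4, produces the corresponding column of counts.

samtools idxstats altalt_snaut.bam | cut -f1,3 #contig name and number of mapped alignment records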

+

The number of alignments on the alternate locus increases proportionately with alternate contig dosage. All of these factors together suggest that the sample presents an alternate haplotype.

+

5.1 Discussion of variant calls for tutorial_8017

+

The three-sample variant callset gives 54 sites on the primary locus and two additional on the alternate locus for 56 variant sites. All of the eight SNP alleles we introduced are called, with six called on the primary assembly and two called on the alternate contig. Of the 15 expected genotype calls, four are incorrect. Namely, four PAALT calls that ought to be heterozygous are called homozygous variant. These are two each on the primary assembly and on the alternate contig in the region that is highly divergent.

+

► Our production pipelines use genomic intervals lists that exclude GRCh38 alternate contigs from variant calling. That is, variant calling is performed only for contigs of the primary assembly. This calling on even just the primary assembly of GRCh38 brings improvements to analysis results over previous assemblies. For example, if we align and call variants for our simulated reads on GRCh37, we call 50 variant sites with identical QUAL scores to the equivalent calls in our GRCh38 callset. However, this GRCh37 callset is missing six variant calls compared to the GRCh38 callset for the 42 kb locus: the two variant sites on the alternate contig and four variant sites on the primary assembly.

+

Consider the example variants on the primary locus. The variant calls from the primary assembly include 32 variant sites that are strictly homozygous variant in ALTALT and heterozygous variant in PAALT. The callset represents only those reads from the ALT that can be mapped to the primary assembly.

+

In contrast, the two variants in regions whose reads can only map to the alternate contig are absent from the primary assembly callset. For this simulated dataset, the primary alignments present on the alternate contig provide enough supporting reads that allow HaplotypeCaller to call the two variants. However, these variant calls have lower-quality annotation metrics than for those simulated in an equal manner on the primary assembly. We will get into why this is in section 6.

+

Additionally, for our PAALT sample that is heterozygous for an alternate haplotype, the genotype calls in the highly divergent regions are inaccurate. These are called homozygous variant on the primary assembly and on the alternate contig when in fact they are heterozygous variant. These calls have lower genotype scores GQ as well as lower allele depth AD and coverage DP. The table below shows the variant calls for the introduced SNP sites. In blue are the genotype calls that should be heterozygous variant but are instead called homozygous variant. +

+

Here is a command to select out the intentional variant sites that uses SelectVariants:

+
java -jar GenomeAnalysisTK.jar -T SelectVariants \
+    -R chr19_chr19_KI270866v1_alt.fasta \
+    -V multisample.vcf -o multisample_selectvariants.vcf \
+    -L chr19:34,383,500 -L chr19:34,389,485 -L chr19:34,391,800 -L chr19:34,392,600 \
+    -L chr19_KI270866v1_alt:32,700 -L chr19_KI270866v1_alt:38,700 \
+    -L chr19_KI270866v1_alt:41,700 -L chr19_KI270866v1_alt:42,700 \
+    -L chr19:34,383,486 -L chr19_KI270866v1_alt:32,714 
+

+back to top

+
+

6. My locus includes an alternate haplotype. How can I call variants on alt contigs?

+

If you want to call variants on alternate contigs, consider additional data processing that overcomes the following problems.

+ +

Let us talk about these in more detail.

+

Ideally, if we are interested in alternate haplotypes, then we would have ensured we were using the most up-to-date analysis reference genome sequence with the latest patch fixes. Also, whatever approach we take to align and preprocess alignments, if we filter any reads as putative contaminants, e.g. with MergeBamAlignment's option to unmap cross-species contamination, then at this point we would want to fish back into the unmapped reads pool and pull out those reads. Specifically, these would have an SA tag indicating mapping to the alternate contig of interest and an FT tag indicating the reason for unmapping was because MergeBamAlignment's UNMAP_CONTAMINANT_READS option identified them as cross-species contamination. Similarly, we want to make sure not to include HaplotypeCaller's --read_filter OverclippedRead option that we use in the first workflow.

+

As section 5.1 shows, variant calls on the alternate contig are of low quality--they have roughly an order of magnitude lower QUAL scores than what should be equivalent variant calls on the primary assembly.

+

For this exploratory tutorial, we are interested in calling the introduced SNPs with equivalent annotation metrics. Whether they are called on the primary assembly or the alternate contig and whether they are called homozygous variant or heterozygous--let's say these are less important, especially given pinning certain variants from highly homologous regions to one of the loci is nigh impossible with our short reads. To this end, we will use the second workflow shown in the workflows diagram. However, because this solution is limited, we present a third workflow as well.

+

► We present these workflows solely for exploratory purposes. They do not represent any production workflows.

+

Tutorial_8017_toSE uses the processed BAM from our first workflow and allows for calling on singular alternate contigs. That is, the workflow is suitable for calling on alternate contigs of loci with only a single alternate contig like our GPI locus. Tutorial_8017_postalt uses the aligned SAM from the first workflow before processing, and requires separate processing before calling. This third workflow allows for calling on all alternate contigs, even on HLA loci that have numerous contigs per primary locus. However, the callset will not be parsimonious. That is, each alternate contig will greedily represent alignments and it is possible the same variant is called for all the alternate loci for a given primary locus as well as on the primary locus. It is up to the analyst to figure out what to do with the resulting calls.

+

The reason for the divide in these two workflows is in the way BWA assigns mapping quality scores (MAPQ) to multimapping reads. Postalt-processing becomes necessary for loci with two or more alternate contigs because the shared alignments between the primary locus and alternate loci will have zero MAPQ scores. Postalt-processing gives non-zero MAPQ scores to the alignment records. The table presents the frequencies of GRCh38 non-HLA alternate contigs per primary locus. It appears that ~75% of non-HLA alternate contigs are singular to ~92% of primary loci with non-HLA alternate contigs. In terms of bases on the primary assembly, of the ~75 megabases that have alternate contigs, ~64 megabases (85%) have singular non-HLA alternate contigs and ~11 megabases (15%) have multiple non-HLA alternate contigs per locus. Our tutorial's example locus falls under this majority.

+

+

In both alt-aware mapping and postalt-processing, alternate contig alignments have a predominance of mates that map back to the primary assembly. HaplotypeCaller, for good reason, filters reads whose mates map to a different contig. However, we know that GRCh38 artificially represents alternate haplotypes as separate contigs and BWA-MEM intentionally maps these mates back to the primary locus. For comparable calls on alternate contigs, we need to include these alignments in calling. To this end, we have devised a temporary workaround.

+

6.1 Variant calls for tutorial_8017_toSE

+

Here we are only aiming for equivalent calls with similar annotation values for the two variants that are called on the alternate contig. For the solution that we will outline, here are the results.

+

+

Including the mate-mapped-to-other-contig alignments bolsters the variant call qualities for the two SNPs HaplotypeCaller calls on the alternate locus. We see the AD allele depths much improved for ALTALT and PAALT. Corresponding to the increase in reads, the GQ genotype quality and the QUAL score (highlighted in red) indicate higher qualities. For example, the QUAL scores increase from 332 and 289 to 2166 and 1764, respectively. We also see that one of the genotype calls changes. For sample ALTALT, we see a previous no call is now a homozygous reference call (highlighted in blue). This hom-ref call is further from the truth than not having a call as the ALTALT sample should not have coverage for this region in the primary assembly.

+

For our example data, tutorial_8017's callset subset for the primary assembly and tutorial_8017_toSE's callset subset for the alternate contigs together appear to make for a better callset.

+

What solution did we apply? As the workflow's name toSE implies, this approach converts paired reads to single end reads. Specifically, this approach takes the processed and coordinate-sorted BAM from the first workflow and removes the 0x1 paired flag from the alignments. Removing the 0x1 flag from the reads allows HaplotypeCaller to consider alignments whose mates map to a different contig. We accomplish this using a modified version of the script presented in Biostars post https://www.biostars.org/p/106668/, indexing with Samtools and then calling with HaplotypeCaller as follows. Note this workaround creates an invalid BAM according to ValidateSamFile. Also, another caveat is that because HaplotypeCaller uses softclipped sequences, any overlapping regions of read pairs will count twice towards variation instead of once. Thus, this step may lead to overconfident calls in such regions.

+

Remove the 0x1 bitwise flag from alignments

+
samtools view -h altalt_snaut.bam | gawk '{printf "%s\t", $1; if(and($2,0x1))
+{t=$2-0x1}else{t=$2}; printf "%s\t" , t; for (i=3; i<NF; i++){printf "%s\t", $i} ; 
+printf "%s\n",$NF}'| samtools view -Sb - > altalt_se.bam
+

Index the resulting BAM

+
samtools index altalt_se.bam
+

Call variants in -ERC GVCF mode with HaplotypeCaller for each sample

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
+    -R chr19_chr19_KI270866v1_alt.fasta \
+    -I altalt_se.bam -o altalt_hc.g.vcf \
+    -ERC GVCF --emitDroppedReads -bamout altalt_hc.bam
+

+

Finally, use GenotypeGVCFs as shown in section 4's command [4.7] for a multisample variant callset. Tutorial_8017_toSE calls 68 variant sites--66 on the primary assembly and two on the alternate contig.

+

6.2 Variant calls for tutorial_8017_postalt

+

BWA's postalt-processing requires the query-grouped output of BWA-MEM. Piping an alignment step with postalt-processing is possible. However, to be able to compare variant calls from an identical alignment, we present the postalt-processing as an add-on workflow that takes the alignment from the first workflow.

+

The command uses the bwa-postalt.js script, which we run through k8, a Javascript execution shell. The command takes the ALT index and the aligned SAM altalt.sam, and redirects the output to altalt_postalt.sam.

+
k8 bwa-postalt.js \
+    chr19_chr19_KI270866v1_alt.fasta.alt \
+    altalt.sam > altalt_postalt.sam
+

The resulting postalt-processed SAM, altalt_postalt.sam, undergoes the same processing as the first workflow (commands 4.1 through 4.7) except that (i) we omit --max_alternate_alleles 3 and --read_filter OverclippedRead options for the HaplotypeCaller command like we did in section 6.1 and (ii) we perform the 0x1 flag removal step from section 6.1.

+

The effect of this postalt-processing is immediately apparent in the IGV screenshots. Previously empty regions are now filled with alignments. Look closely in the highly divergent region of the primary locus. Do you notice a change, albeit subtle, before and after postalt-processing for samples ALTALT and PAALT?

+

These alignments give the calls below for our SNP sites of interest. Here, notice calls are made for more sites--on the equivalent site if present in addition to the design site (highlighted in the first two columns). For the three pairs of sites that can be called on either the primary locus or alternate contig, the variant site QUALs, the INFO field annotation metrics and the sample level annotation values are identical for each pair.

+

+

Postalt-processing lowers the MAPQ of primary locus alignments in the highly divergent region that map better to the alt locus. You can see this as a subtle change in the IGV screenshot. After postalt-processing we see an increase in white zero MAPQ reads in the highly divergent region of the primary locus for ALTALT and PAALT. For ALTALT, this effectively cleans up the variant calls in this region at chr19:34,391,800 and chr19:34,392,600. Previously for ALTALT, these calls contained some reads: 4 and 25 for the first workflow and 0 and 28 for the second workflow. After postalt-processing, no reads are considered in this region giving us ./.:0,0:0:.:0,0,0 calls for both sites.

+

What we omit from examination are the effects of postalt-processing on decoy contig alignments. Namely, if an alignment on the primary assembly aligns better on a decoy contig, then postalt-processing discounts the alignment on the primary assembly by assigning it a zero MAPQ score.

+

To wrap up, here are the number of variant sites called for the three workflows. As you can see, this last workflow calls the most variants at 95 variant sites, with 62 on the primary assembly and 33 on the alternate contig.

+
Workflow                total    on primary assembly    on alternate contig
+tutorial_8017           56       54                      2
+tutorial_8017_toSE      68       66                      2
+tutorial_8017_postalt   95       62                     33
+

+back to top

+
+

7. Related resources

+ +

back to top

+
\ No newline at end of file diff --git a/doc_archive/tutorials/(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md b/doc_archive/tutorials/(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md new file mode 100644 index 000000000..485c064bc --- /dev/null +++ b/doc_archive/tutorials/(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md @@ -0,0 +1,158 @@ +## (How to) Mark duplicates with MarkDuplicates or MarkDuplicatesWithMateCigar + +http://gatkforums.broadinstitute.org/gatk/discussion/6747/how-to-mark-duplicates-with-markduplicates-or-markduplicateswithmatecigar + +

+ This tutorial updates Tutorial#2799.

+

Here we discuss two tools, MarkDuplicates and MarkDuplicatesWithMateCigar, that flag duplicates. We provide example data and example commands for you to follow along the tutorial (section 1) and include tips in estimating library complexity for PCR-free samples and patterned flow cell technologies. In section 2, we point out special memory considerations for these tools. In section 3, we highlight the similarities and differences between the two tools. Finally, we get into some details that may be of interest to some that includes comments on the metrics file (section 4).

+
+

To mark duplicates in RNA-Seq data, use MarkDuplicates. Reasons are explained in section 2 and section 3. And if you are considering using MarkDuplicatesWithMateCigar for your DNA data, be sure insert lengths are short and you have a low percentage of split or multi-mapping records.

+
+

Obviously, expect more duplicates for samples prepared with PCR than for PCR-free preparations. Duplicates arise from various sources, including within the sequencing run. As such, even PCR-free data can give rise to duplicates, albeit at low rates, as illustrated here with our example data.

+

Which tool should I use, MarkDuplicates or MarkDuplicatesWithMateCigar? new section 5/25/2016

+

The Best Practices so far recommends MarkDuplicates. However, as always, consider your research goals.

+

If your research uses paired end reads and pre-processing that generates missing mates, for example by application of an intervals list or by removal of reference contigs after the initial alignment, and you wish to flag duplicates for these remaining singletons, then MarkDuplicatesWithMateCigar will flag these for you at the insert level using the mate cigar (MC) tag. MarkDuplicates skips these singletons from consideration.

+

If the criteria by which the representative insert in a duplicate set is selected are important to your analyses, then note that MarkDuplicatesWithMateCigar is limited to prioritizing by the total mapped length of a pair, while MarkDuplicates can use this OR the default sum of base qualities of a pair.

+

If you are still unsure which tool is appropriate, then consider maximizing comparability to previous analyses. The Broad Genomics Platform has used only MarkDuplicates in their production pipelines. MarkDuplicatesWithMateCigar is a newer tool that has yet to gain traction.

+

This tutorial compares the two tools to dispel the circulating notion that the outcomes from the two tools are equivalent and to provide details helpful to researchers in optimizing their analyses.

+

We welcome feedback. Share your suggestions in the Comment section at the bottom of this page.

+
+

Jump to a section

+
  1. Commands for MarkDuplicates and MarkDuplicatesWithMateCigar
  2. Slow or out of memory error? Special memory considerations for duplicate marking tools
  3. Conceptual overview of duplicate flagging
  4. Details of interest to some
+

Tools involved

+ +

Prerequisites

+ +

Download example data

+ +

Related resources

+ +
+

+

1. Commands for MarkDuplicates and MarkDuplicatesWithMateCigar

+

The following commands take a coordinate-sorted and indexed BAM and return (i) a BAM with the same records in coordinate order and with duplicates marked by the 1024 flag, (ii) a duplication metrics file, and (iii) an optional matching BAI index.

+

For a given file with all MC (mate CIGAR) tags accounted for:

+ +

Use the following commands to flag duplicates for 6747_snippet.bam. These commands produce qualitatively different data.

+

Score duplicate sets based on the sum of base qualities using MarkDuplicates:

+
java -Xmx32G -jar picard.jar MarkDuplicates \
+INPUT=6747_snippet.bam \ #specify multiple times to merge 
+OUTPUT=6747_snippet_markduplicates.bam \
+METRICS_FILE=6747_snippet_markduplicates_metrics.txt \ 
+OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ #changed from default of 100
+CREATE_INDEX=true \ #optional
+TMP_DIR=/tmp
+

Score duplicate sets based on total mapped reference length using MarkDuplicatesWithMateCigar:

+
java -Xmx32G -jar picard.jar MarkDuplicatesWithMateCigar \
+INPUT=6747_snippet.bam \ #specify multiple times to merge
+OUTPUT=6747_snippet_markduplicateswithmatecigar.bam \
+METRICS_FILE=6747_snippet_markduplicateswithmatecigar_metrics.txt \ 
+OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ #changed from default of 100
+CREATE_INDEX=true \ #optional
+TMP_DIR=/tmp
+

Comments on select parameters

+ +

For snippet, the duplication metrics are identical whether marked by MarkDuplicates or MarkDuplicatesWithMateCigar. We have 13.4008% duplication, with 255 unpaired read duplicates and 18,254 paired read duplicates. However, as the screenshot at the top of this page illustrates, and as section 4 explains, the data qualitatively differ.

+

back to top

+
+

+

2. Slow or out of memory error? Special memory considerations for duplicate marking tools

+

The seemingly simple task of marking duplicates is one of the most memory hungry processes, especially for paired end reads. Both tools are compute-intensive and require upping memory compared to other processes.

+

Because of the single-pass nature of MarkDuplicatesWithMateCigar, for a given file its memory requirements can be greater than for MarkDuplicates. What this means is that MarkDuplicatesWithMateCigar streams the duplicate marking routine in a manner that allows for piping. Due to these memory constraints for MarkDuplicatesWithMateCigar, we recommend MarkDuplicates for alignments that have large reference skips, e.g. spliced RNA alignments.

+

For large files, (1) use the Java -Xmx setting and (2) set the environmental variable TMP_DIR for a temporary directory. These options allow the tool to run without slowing down as well as run without causing an out of memory error. For the purposes of this tutorial, commands are given as if the example data is a large file, which we know it is not.

+
    java -Xmx32G -jar picard.jar MarkDuplicates \
+    ... \
+    TMP_DIR=/tmp 
+

These options can be omitted for small files such as the example data and the equivalent command is as follows.

+
    java -jar picard.jar MarkDuplicates ...   
+

Set the java maxheapsize, specified by the -Xmx#G option, to the maximum your system allows.

+

The high memory cost, especially for MarkDuplicatesWithMateCigar, is in part because the tool systematically traverses genomic coordinate intervals for inserts in question, and for every read it marks as a duplicate it must keep track of the mate, which may or may not map nearby, so that reads are marked as pairs with each record emitted in its coordinate turn. In the meanwhile, this information is held in memory, which is the first choice for faster processing, until the memory limit is reached, at which point memory spills to disk. We set this limit high to minimize instances of memory spilling to disk.

+

In the example command, the -Xmx32G Java option caps the maximum heap size, or memory usage, to 32 gigabytes, which is the limit on the server I use. This is in contrast to the 8G setting I use for other processes on the same sample data--a 75G BAM file. To find a system's default maximum heap size, type java -XX:+PrintFlagsFinal -version, and look for MaxHeapSize.
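For example, the following pulls just that value out of the long list of JVM flags.

java -XX:+PrintFlagsFinal -version | grep MaxHeapSize #reports the default maximum heap size in bytes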

+

Set an additional temporary directory with the TMP_DIR parameter for memory spillage.

+

When the tool hits the memory limit, memory spills to disk. This causes data to be written to and read back from disk, slowing the process down. Disk is a location you specify with the TMP_DIR parameter. If you work on a server separate from where you read and write files to, setting TMP_DIR to the server's local temporary directory (typically /tmp) can reduce processing time compared to setting it to the storage disk. This is because the tool then additionally avoids traversing the network file system when spilling memory. Be sure the TMP_DIR location you specify provides enough storage space. Use df -h to see how much is available.

+

back to top

+
+

+

3. Conceptual overview of duplicate flagging

+

The aim of duplicate marking is to flag all but one of a duplicate set as duplicates and to use duplicate metrics to estimate library complexity. Duplicates have a higher probability of being non-independent measurements from the exact same template DNA. Duplicate inserts are marked by the 0x400 bit (1024 flag) in the second column of a SAM record, for each mate of a pair. This allows downstream GATK tools to exclude duplicates from analyses (most do this by default). Certain duplicates, i.e. PCR and sequencer duplicates, violate assumptions of variant calling and also potentially amplify errors. Removing these, even at the cost of removing serendipitous biological duplicates, allows us to be conservative in calculating the confidence of variants.
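After marking, a quick check of how many records carry the duplicate flag, using the output file name from the section 1 command, looks like this.

samtools view -c -f 1024 6747_snippet_markduplicates.bam #counts records flagged as duplicate (0x400)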

+
+

GATK tools allow you to disable the duplicate read filter with -drf DuplicateRead so you can include duplicates in analyses.

+
+

For a whole genome DNA sample, duplicates arise from three sources: (i) in DNA shearing from distinct molecular templates identical in insert mapping, (ii) from PCR amplification of a template (PCR duplicates), and (iii) from sequencing, e.g. optical duplicates. The tools cannot distinguish between these types of duplicates with the exception of optical duplicates. In estimating library complexity, the latter two types of duplicates are undesirable and should each factor differently.

+

When should we not care about duplicates? Given duplication metrics, we can make some judgement calls on the quality of our sample preparation and sequencer run. Of course, we may not expect a complex library if our samples are targeted amplicons. Also, we may expect minimal duplicates if our samples are PCR-free. Or it may be that because of the variation inherent in expression level data, e.g. RNA-Seq, duplicate marking becomes ritualistic. Unless you are certain of your edge case (amplicon sequencing, RNA-Seq allele-specific expression analysis, etc.) where duplicate marking adds minimal value, you should go ahead and mark duplicates. You may find yourself staring at an IGV session trying to visually calculate the strength of the evidence for a variant. We can pat ourselves on the back for having the forethought to systematically mark duplicates and turn on the IGV duplicate filter.

+
+

The Broad's Genomics Platform uses MarkDuplicates twice for multiplexed samples. Duplicates are flagged first per sample per lane to estimate lane-level library complexity, and second to aggregate data per sample while marking all library duplicates. In the second pass, duplicate marking tools again assess all reads for duplicates and overwrite any prior flags.

+
+

Our two duplicate flagging tools share common features but differ at the core. As the name implies, MarkDuplicatesWithMateCigar uses the MC (mate CIGAR) tag for mate alignment information. Unlike MarkDuplicates, it is a single-pass tool that requires pre-computed MC tags.

+ +

back to top

+
+

+

4. Details of interest to some

+

To reach a high target coverage depth, some fraction of sequenced reads will, by chance alone, be duplicate reads.

+

Let us hope the truth of a variant never comes down to so few reads that duplicates should matter so. Keep in mind the better evidence for a variant is the presence of overlapping reads that contain the variant. Also, take estimated library complexity at face value--an estimate.

+

Don't be duped by identical numbers. Data from the two tools qualitatively differ.

+

First, let me reiterate that secondary and supplementary alignment records are skipped and never flagged as duplicate.

+

Given a file with no missing mates, each tool identifies the same duplicate sets from primary alignments only and therefore the same number of duplicates. To reiterate, the number of identical loci or duplicate sets, and the records within each set, are the same for each tool. However, the tools differ in how they decide which insert(s) within a set get flagged, and thus which insert remains the representative non-duplicate. Also, the tools may break ties differently, because tie-breaking can depend on the sort order of the records in memory.

+ +

Duplicate metrics in brief

+

We can break down the metrics file into two parts: (1) a table of metrics that counts various categories of duplicates and gives the library complexity estimate, and (2) histogram values in two columns.

+

See DuplicationMetrics for descriptions of each metric. For paired reads, duplicates are considered for the insert. For single end reads, duplicates are considered singly for the read, increasing the likelihood of being identified as a duplicate. Given the lack of insert-level information for these singly mapping reads, the insert metrics calculations exclude these.

+

The library complexity estimate only considers the duplicates that remain after subtracting out optical duplicates. For the math to derive estimated library size, see formula (1.2) in Mathematical Notes on SAMtools Algorithms.
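
For orientation, that estimate is based on a Lander-Waterman style relation: writing N for the read pairs examined (after subtracting optical duplicates), C for the distinct pairs observed, and X for the estimated library size, the estimate solves C/X = 1 - exp(-N/X) for X numerically. See the cited note for the exact derivation and notation.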

+

The histogram values extrapolate the calculated library complexity to a saturation curve plotting the gains in complexity if you sequence additional aliquots of the same library. The first bin's value represents the current complexity.

+

Pair orientation F1R2 is distinct from F2R1 for optical duplicates

+

Here we refer you to a five minute video illustrating what happens at the molecular level in a typical sequencing by synthesis run.

+

What I would like to highlight is that each strand of an insert has a chance to seed a different cluster. I will also point out that, due to sequencing chemistry, F1 and R1 reads typically have better base qualities than F2 and R2 reads.

+
+

Optical duplicate designation requires the same pair orientation.

+
+

Let us work out the implications of this for a paired end, unstranded DNA library. During sequencing, within the flow cell, for a particular insert produced by sample preparation, the strands of the insert are separated and each strand has a chance to seed a different cluster. Let's say for InsertAB, ClusterA and ClusterB and for InsertCD, ClusterC and ClusterD. InsertAB and InsertCD are identical in sequence and length and map to the same loci. It is possible InsertAB and InsertCD are PCR duplicates, and also possible they represent original inserts. Each strand is then sequenced in the forward and reverse directions to give four pieces of information in total for the given insert, e.g. ReadPairA and ReadPairB for InsertAB. The pair orientations of these two pairs are reversed--one cluster will give F1R2 and the other will give F2R1 pair orientation. Both read pairs map to exactly the same loci. Our duplicate marking tools consider ReadPairA and ReadPairB part of the same duplicate set for regular duplicates, but not for optical duplicates. Optical duplicates require identical pair orientation.

+

back to top

+
+

\ No newline at end of file diff --git a/doc_archive/tutorials/(How_to)_Simulate_reads_using_a_reference_genome_ALT_contig.md b/doc_archive/tutorials/(How_to)_Simulate_reads_using_a_reference_genome_ALT_contig.md new file mode 100644 index 000000000..0d0b442a4 --- /dev/null +++ b/doc_archive/tutorials/(How_to)_Simulate_reads_using_a_reference_genome_ALT_contig.md @@ -0,0 +1,87 @@ +## (How to) Simulate reads using a reference genome ALT contig + +http://gatkforums.broadinstitute.org/gatk/discussion/7859/how-to-simulate-reads-using-a-reference-genome-alt-contig + +

This tutorial shows how to generate simulated reads against a specific target sequence. This can be useful, e.g. if you want to simulate reads for an alternate contig in GRCh38/hg38 to see whether they end up mapping to the primary assembly or to the alternate contig.

+

We use external tools to accomplish this. In Section 1, we use Samtools to subset the target contig sequence from a reference FASTA file. In Section 2, we use wgsim to generate FASTQ format paired reads against the target contig. The resulting read data is ready for alignment.

+

This tutorial provides example data for you to follow along and includes a mini-reference FASTA. If you are unfamiliar with terms that describe reference genome components, take a few minutes to study the Dictionary entry Reference Genome Components.

+
+

Prerequisites and tools involved

+

This tutorial uses external tools that may require additional dependencies, e.g. the gcc compiler, that may not be available by default on your system.

+ +

Download example data

+ +
+

1. Use Samtools to subset target contig sequence from FASTA reference

+

Each contig in the reference FASTA has a header line beginning with > that identifies the contig sequence that follows. We need the exact representation of this header line to subset the target contig sequence. The UNIX command below lists all such headers for the FASTA file.

+
grep '>' chr19_chr19_KI270866v1_alt.fasta
+

This prints the following for our mini-reference chr19_chr19_KI270866v1_alt.fasta.

+
>chr19
+>chr19_KI270866v1_alt
+

Use the faidx option of Samtools to subset the ALT contig sequence to a new FASTA file, chr19_KI270866v1_alt.fasta.

+
samtools faidx chr19_chr19_KI270866v1_alt.fasta chr19_KI270866v1_alt > chr19_KI270866v1_alt.fasta
+
+
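As an optional sanity check (assuming a UNIX shell), you can count the bases written to the new FASTA; for this contig the total should come out to roughly 43 kb.

grep -v '>' chr19_KI270866v1_alt.fasta | tr -d '\n' | wc -c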

Optionally introduce variants into reads

+

To introduce variants into reads, alter the FASTA sequence at this point before simulating reads. For example, to introduce a simple heterozygous SNP, duplicate the contig information within the file, name the duplicate contig differently, and change the base within the duplicated sequence. Search for the target base's sequence context by using TextEdit's Find function. Keep in mind FASTA file sequences contain line breaks.
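
For example, a minimal sketch of the duplicate-and-rename step on a UNIX shell; the copy's contig name is invented for illustration, and you would still edit the target base in the duplicated sequence with a text editor afterwards, as described above.

samtools faidx chr19_KI270866v1_alt.fasta chr19_KI270866v1_alt | sed 's/^>.*/>chr19_KI270866v1_alt_snp/' > contig_copy.fasta
cat chr19_KI270866v1_alt.fasta contig_copy.fasta > chr19_KI270866v1_alt_het.fasta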

+

To generate an alternate FASTA reference based on a VCF of variants, see GATK’s FastaAlternateReferenceMaker.

+
+

2. Use wgsim to simulate FASTQ paired reads against the target contig FASTA

+

Generate simulated reads from chr19_KI270866v1_alt.fasta with the following command.

+
wgsim -1151 -2151 -d500 -r0 -e0 -N10000 -R0 -X0 chr19_KI270866v1_alt.fasta 7859_GPI.read1.fq 7859_GPI.read2.fq
+

This gives two FASTQ files, 7859_GPI.read1.fq and 7859_GPI.read2.fq, one for each mate of the paired reads.

+ +

For a 43 kb contig, 10K x 2 x 151 reads should give us ~70x hypothetical coverage. Here are two pairs of reads from 7859_GPI.read1.fq and 7859_GPI.read2.fq.
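
Since each FASTQ record spans four lines, a quick check that the requested 10,000 read pairs were written is to divide the line count of either file by four:

echo $(( $(wc -l < 7859_GPI.read1.fq) / 4 ))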

+

7859_GPI.read1.fq

+
@chr19_KI270866v1_alt_40173_40622_0:0:0_0:0:0_0/1
+AGGTATGAGGATCTGGGTCTTCCCGTGTCTGAGTAGGTAGCACCTGGCACAGGTATGAGGATATGGGTCTTCCATGTCTGAGGAGGTAGCACCTGGCACAGATATGAGGATCTGCGTCTTCCAGTGTTTGAGGAGGTGAGTTTGGACTCAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@chr19_KI270866v1_alt_30797_31341_0:0:0_0:0:0_1/1
+CACCACTGCTGAGCTCAGGCAAGTGCACAAGGAAAGCTGTGGCTCACTGCTCGGCTCCAGCAGAGGTGGTCCCATGGACCACCTGTTGCTACAGAGGGGTCGGCAGCCCTGTCACTCAAGGCAGGGTTTGCTCTGCAAGCTGCCCCAGCCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+

7859_GPI.read2.fq

+
@chr19_KI270866v1_alt_40173_40622_0:0:0_0:0:0_0/2
+AGGGCCAGATCACACCTCCTCAGATATTGACCGACCCAGATCCTTATACCTGCACCAGATCCTACCTCCTCAGGCATTGACAGATCCAGATCCTTATACTTGTGCCAGATCCTACCTCCTTAGACATGGACAGACCCAGATCCTCATACCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@chr19_KI270866v1_alt_30797_31341_0:0:0_0:0:0_1/2
+AGGCCCATGAGGTCAGGTCAGTGTTTATTGAGTACCTGCTGCATACCTAGCTTGGGGAAAGGTAGAGAGGCCCTCAGAGAGGCTTGGAGGGCAAGAGCAACCCAGGCAGGATGAGGGCTCCACTTCCACCTGAGGGCGGGCTGAGCTTGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+

All the bases of all the reads from a simulation have the same base quality, and in this instance each base quality is I. Notice the read names of the simulated reads contain useful information, e.g. the last read name @chr19_KI270866v1_alt_30797_31341_0:0:0_0:0:0_1/2 consists of the following.

+ +
+

Related resources

+ +
\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Apply_hard_filters_to_a_call_set.md b/doc_archive/tutorials/(howto)_Apply_hard_filters_to_a_call_set.md new file mode 100644 index 000000000..5826aa5ea --- /dev/null +++ b/doc_archive/tutorials/(howto)_Apply_hard_filters_to_a_call_set.md @@ -0,0 +1,107 @@ +## (howto) Apply hard filters to a call set + +http://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set + +

Objective

+

Apply hard filters to a variant callset that is too small for VQSR or for which truth/training sets are not available.

+

Caveat

+

This document is intended to illustrate how to compose and run the commands involved in applying the hard filtering method. The annotations and values used may not reflect the most recent recommendations. Be sure to read the documentation about why you would use hard filters and how to understand and improve upon the generic hard filtering recommendations that we provide.

+

Steps

+
  1. Extract the SNPs from the call set
  2. Determine parameters for filtering SNPs
  3. Apply the filter to the SNP call set
  4. Extract the Indels from the call set
  5. Determine parameters for filtering indels
  6. Apply the filter to the Indel call set
+
+

1. Extract the SNPs from the call set

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T SelectVariants \ 
+    -R reference.fa \ 
+    -V raw_variants.vcf \ 
+    -selectType SNP \ 
+    -o raw_snps.vcf 
+

Expected Result

+

This creates a VCF file called raw_snps.vcf, containing just the SNPs from the original file of raw variants.

+
+

2. Determine parameters for filtering SNPs

+

SNPs matching any of these conditions will be considered bad and filtered out, i.e. marked FILTER in the output VCF file. The program will specify which parameter was chiefly responsible for the exclusion of the SNP using the culprit annotation. SNPs that do not match any of these conditions will be considered good and marked PASS in the output VCF file.

- QualByDepth (QD): This is the variant confidence (from the QUAL field) divided by the unfiltered depth of non-reference samples.

- FisherStrand (FS): Phred-scaled p-value using Fisher’s Exact Test to detect strand bias (the variation being seen on only the forward or only the reverse strand) in the reads. More bias is indicative of false positive calls.

- RMSMappingQuality (MQ): This is the Root Mean Square of the mapping quality of the reads across all samples.

- MappingQualityRankSumTest (MQRankSum): This is the u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele). Note that the mapping quality rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles, i.e. this will only be applied to heterozygous calls.

- ReadPosRankSumTest (ReadPosRankSum): This is the u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele. If the alternate allele is only seen near the ends of reads, this is indicative of error. Note that the read position rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles, i.e. this will only be applied to heterozygous calls.

+
+

3. Apply the filter to the SNP call set

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T VariantFiltration \ 
+    -R reference.fa \ 
+    -V raw_snps.vcf \ 
+    --filterExpression "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" \ 
+    --filterName "my_snp_filter" \ 
+    -o filtered_snps.vcf 
+

Expected Result

+

This creates a VCF file called filtered_snps.vcf, containing all the original SNPs from the raw_snps.vcf file, but now the SNPs are annotated with either PASS or FILTER depending on whether or not they passed the filters.

+

For SNPs that failed the filter, the variant annotation also includes the name of the filter. That way, if you apply several different filters (simultaneously or sequentially), you can keep track of which filter(s) each SNP failed, and later you can retrieve specific subsets of your calls using the SelectVariants tool. To learn more about composing different types of filtering expressions and retrieving subsets of variants using SelectVariants, please see the online GATK documentation.
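
For example, a sketch of pulling out only the passing SNPs with SelectVariants, following the same conventions as the commands above:

java -jar GenomeAnalysisTK.jar \ 
    -T SelectVariants \ 
    -R reference.fa \ 
    -V filtered_snps.vcf \ 
    --excludeFiltered \ 
    -o passing_snps.vcf 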

+
+

4. Extract the Indels from the call set

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T SelectVariants \ 
+    -R reference.fa \ 
+    -V raw_HC_variants.vcf \ 
+    -selectType INDEL \ 
+    -o raw_indels.vcf 
+

Expected Result

+

This creates a VCF file called raw_indels.vcf, containing just the Indels from the original file of raw variants.

+
+

5. Determine parameters for filtering Indels.

+

Indels matching any of these conditions will be considered bad and filtered out, i.e. marked FILTER in the output VCF file. The program will specify which parameter was chiefly responsible for the exclusion of the indel using the culprit annotation. Indels that do not match any of these conditions will be considered good and marked PASS in the output VCF file.

- QualByDepth (QD): This is the variant confidence (from the QUAL field) divided by the unfiltered depth of non-reference samples.

- FisherStrand (FS): Phred-scaled p-value using Fisher’s Exact Test to detect strand bias (the variation being seen on only the forward or only the reverse strand) in the reads. More bias is indicative of false positive calls.

- ReadPosRankSumTest (ReadPosRankSum): This is the u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele. If the alternate allele is only seen near the ends of reads, this is indicative of error. Note that the read position rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles, i.e. this will only be applied to heterozygous calls.

+
+

6. Apply the filter to the Indel call set

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T VariantFiltration \ 
+    -R reference.fa \ 
+    -V raw_indels.vcf \ 
+    --filterExpression "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0" \ 
+    --filterName "my_indel_filter" \ 
+    -o filtered_indels.vcf 
+

Expected Result

+

This creates a VCF file called filtered_indels.vcf, containing all the original Indels from the raw_indels.vcf file, but now the Indels are annotated with either PASS or FILTER depending on whether or not they passed the filters.

+

For Indels that failed the filter, the variant annotation also includes the name of the filter. That way, if you apply several different filters (simultaneously or sequentially), you can keep track of which filter(s) each Indel failed, and later you can retrieve specific subsets of your calls using the SelectVariants tool. To learn more about composing different types of filtering expressions and retrieving subsets of variants using SelectVariants, please see the online GATK documentation.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Call_variants_with_HaplotypeCaller.md b/doc_archive/tutorials/(howto)_Call_variants_with_HaplotypeCaller.md new file mode 100644 index 000000000..f97b68818 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Call_variants_with_HaplotypeCaller.md @@ -0,0 +1,50 @@ +## (howto) Call variants with HaplotypeCaller + +http://gatkforums.broadinstitute.org/gatk/discussion/2803/howto-call-variants-with-haplotypecaller + +

Objective

+

Call variants on a single genome with the HaplotypeCaller, producing a raw (unfiltered) VCF.

+

Caveat

+

This is meant only for single-sample analysis. To analyze multiple samples, see the Best Practices documentation on joint analysis.

+

Prerequisites

+ +

Steps

+
  1. Determine the basic parameters of the analysis
  2. Call variants in your sequence data
+
+

1. Determine the basic parameters of the analysis

+

If you do not specify these parameters yourself, the program will use default values. However we recommend that you set them explicitly because it will help you understand how the results are bounded and how you can modify the program's behavior.

- Genotyping mode (--genotyping_mode): This specifies how we want the program to determine the alternate alleles to use for genotyping. In the default DISCOVERY mode, the program will choose the most likely alleles out of those it sees in the data. In GENOTYPE_GIVEN_ALLELES mode, the program will only use the alleles passed in from a VCF file (using the -alleles argument). This is useful if you just want to determine if a sample has a specific genotype of interest and you are not interested in other alleles.

- Emission confidence threshold (-stand_emit_conf): This is the minimum confidence threshold (phred-scaled) at which the program should emit sites that appear to be possibly variant.

- Calling confidence threshold (-stand_call_conf): This is the minimum confidence threshold (phred-scaled) at which the program should emit variant sites as called. If a site's associated genotype has a confidence score lower than the calling threshold, the program will emit the site as filtered and will annotate it as LowQual. This threshold separates high confidence calls from low confidence calls.

+

The terms "called" and "filtered" are tricky because they can mean different things depending on context. In ordinary language, people often say a site was called if it was emitted as variant. But in the GATK's technical language, saying a site was called means that that site passed the confidence threshold test. For filtered, it's even more confusing, because in ordinary language, when people say that sites were filtered, they usually mean that those sites successfully passed a filtering test. However, in the GATK's technical language, the same phrase (saying that sites were filtered) means that those sites failed the filtering test. In effect, it means that those would be filtered out if the filter was used to actually remove low-confidence calls from the callset, instead of just tagging them. In both cases, both usages are valid depending on the point of view of the person who is reporting the results. So it's always important to check what is the context when interpreting results that include these terms.

+
+

2. Call variants in your sequence data

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T HaplotypeCaller \ 
+    -R reference.fa \ 
+    -I preprocessed_reads.bam \  
+    -L 20 \ 
+    --genotyping_mode DISCOVERY \ 
+    -stand_emit_conf 10 \ 
+    -stand_call_conf 30 \ 
+    -o raw_variants.vcf 
+

Note that -L specifies that we only want to run the command on a subset of the data (here, chromosome 20). This is useful for testing as well as other purposes, as documented here. For example, when running on exome data, we use -L to specify a file containing the list of exome targets corresponding to the capture kit that was used to generate the exome libraries.

+

Expected Result

+

This creates a VCF file called raw_variants.vcf, containing all the sites that the HaplotypeCaller evaluated to be potentially variant. Note that this file contains both SNPs and Indels.

+

Although you now have a nice fresh set of variant calls, the variant discovery stage is not over. The distinctions made by the caller itself between low-confidence calls and the rest is very primitive, and should not be taken as a definitive guide for filtering. The GATK callers are designed to be very lenient in calling variants, so it is extremely important to apply one of the recommended filtering methods (variant recalibration or hard-filtering), in order to move on to downstream analyses with the highest-quality call set possible.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Call_variants_with_the_UnifiedGenotyper.md b/doc_archive/tutorials/(howto)_Call_variants_with_the_UnifiedGenotyper.md new file mode 100644 index 000000000..db439b0df --- /dev/null +++ b/doc_archive/tutorials/(howto)_Call_variants_with_the_UnifiedGenotyper.md @@ -0,0 +1,51 @@ +## (howto) Call variants with the UnifiedGenotyper + +http://gatkforums.broadinstitute.org/gatk/discussion/2804/howto-call-variants-with-the-unifiedgenotyper + +

Note: the UnifiedGenotyper has been replaced by HaplotypeCaller, which is a much better tool. UG is still available but you should really consider using HC instead.

+

Objective

+

Call variants on a haploid genome with the UnifiedGenotyper, producing a raw (unfiltered) VCF.

+

Prerequisites

+ +

Steps

+
  1. Determine the basic parameters of the analysis
  2. Call variants in your sequence data
+
+

1. Determine the basic parameters of the analysis

+

If you do not specify these parameters yourself, the program will use default values. However we recommend that you set them explicitly because it will help you understand how the results are bounded and how you can modify the program's behavior.

- Ploidy (-ploidy): In its basic use, this is the ploidy (number of chromosomes) per sample. By default it is set to 2, to process diploid organisms' genomes, but it can be set to any other desired value, starting at 1 for haploid organisms, and up for polyploids. This argument can also be used to handle pooled data. For that purpose, you'll need to set -ploidy to Number of samples in each pool * Sample Ploidy. There is no fixed upper limit, but keep in mind that high-level ploidy will increase processing times since the calculations involved are more complex. For full details on how to process pooled data, see Eran et al. (in preparation).

- Genotype likelihood model (-glm): This is the model that the program will use to calculate the genotype likelihoods. By default, it is set to SNP, but it can also be set to INDEL or BOTH. If set to BOTH, both SNPs and Indels will be called in the same run and be output to the same variants file.

- Emission confidence threshold (-stand_emit_conf): This is the minimum confidence threshold (phred-scaled) at which the program should emit sites that appear to be possibly variant.

- Calling confidence threshold (-stand_call_conf): This is the minimum confidence threshold (phred-scaled) at which the program should emit variant sites as called. If a site's associated genotype has a confidence score lower than the calling threshold, the program will emit the site as filtered and will annotate it as LowQual. This threshold separates high confidence calls from low confidence calls.

+

The terms called and filtered are tricky because they can mean different things depending on context. In ordinary language, people often say a site was called if it was emitted as variant. But in the GATK's technical language, saying a site was called means that that site passed the confidence threshold test. For filtered, it's even more confusing, because in ordinary language, when people say that sites were filtered, they usually mean that those sites successfully passed a filtering test. However, in the GATK's technical language, the same phrase (saying that sites were filtered) means that those sites failed the filtering test. In effect, it means that those would be filtered out if the filter was used to actually remove low-confidence calls from the callset, instead of just tagging them. In both cases, both usages are valid depending on the point of view of the person who is reporting the results. So it's always important to check what is the context when interpreting results that include these terms.

+
+

2. Call variants in your sequence data

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T UnifiedGenotyper \ 
+    -R haploid_reference.fa \ 
+    -I haploid_reads.bam \ 
+    -L 20 \ 
+    -glm BOTH \ 
+    --stand_call_conf 30 \ 
+    --stand_emit_conf 10 \ 
+    -o raw_ug_variants.vcf 
+

This creates a VCF file called raw_ug_variants.vcf, containing all the sites that the UnifiedGenotyper evaluated to be potentially variant.

+

Note that -L specifies that we only want to run the command on a subset of the data (here, chromosome 20). This is useful for testing as well as other purposes. For example, when running on exome data, we use -L to specify a file containing the list of exome targets corresponding to the capture kit that was used to generate the exome libraries.

+

Although you now have a nice fresh set of variant calls, the variant discovery stage is not over. The distinctions made by the caller itself between low-confidence calls and the rest is very primitive, and should not be taken as a definitive guide for filtering. The GATK callers are designed to be very lenient in calling variants, so it is extremely important to apply one of the recommended filtering methods (variant recalibration or hard-filtering), in order to move on to downstream analyses with the highest-quality call set possible.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Discover_variants_with_GATK_-_A_GATK_Workshop_Tutorial.md b/doc_archive/tutorials/(howto)_Discover_variants_with_GATK_-_A_GATK_Workshop_Tutorial.md new file mode 100644 index 000000000..e45ae3b40 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Discover_variants_with_GATK_-_A_GATK_Workshop_Tutorial.md @@ -0,0 +1,262 @@ +## (howto) Discover variants with GATK - A GATK Workshop Tutorial + +http://gatkforums.broadinstitute.org/gatk/discussion/7869/howto-discover-variants-with-gatk-a-gatk-workshop-tutorial + +

GATK TUTORIAL :: Variant Discovery :: Worksheet

+

June 2016 - GATK 3.6

+

This tutorial covers material taught at GATK workshops, and focuses on key steps of the GATK Best Practices for Germline SNP and Indel Discovery in Whole Genomes and Exomes. If you aren't already, please set up your computer using the workshop-specific installation instructions. You can find additional background information relevant to this tutorial in the Variant Discovery Appendix.

+ +

Our main purpose is to demonstrate an effective workflow for calling germline SNPs and indels in cohorts of multiple samples. This workflow can be applied to whole genomes as well as exomes and other targeted sequencing datasets.

+

We’ll start by examining the differences between data types (whole genomes, exomes and RNAseq) to highlight the properties of the data that influence what we need to do to analyze it as well as what we can expect to get out of it.

+

Once we understand our data, we will demonstrate how key features of the HaplotypeCaller enable it to produce better results than position-based callers like UnifiedGenotyper. In particular, we’ll show how local assembly of haplotypes and realignment of reads are crucial to producing superior indel calls. Along the way we’ll show you useful tips and tricks for troubleshooting variant calls with HaplotypeCaller and the IGV genome browser.

+

All this will build up to demonstrating the GVCF workflow for joint variant analysis, as applied to a trio of whole-genome samples. We hope to convince you that this workflow has substantial practical advantages over a joint analysis that is achieved by calling variants simultaneously on all samples, while producing results that are just as good or even better.

+

The tutorial dataset is available for public download here.

+
+

Table of Contents

+
    +
  1. WORKING WITH DATASETS FROM DIFFERENT EXPERIMENTAL DESIGNS
     1.1 The genome reference: b37
     1.2 The test sample: NA12878 Whole-Genome Sequence (WGS)
     1.3 For comparison: NA12878 Exome Sequence
     1.4 Another comparison: NA12878 RNAseq
  2. DIAGNOSING UNKNOWN BAMS
     2.1 View header and check read groups
     2.2 Validate the file
  3. VARIANT DISCOVERY
     3.1 Call variants with a position-based caller: UnifiedGenotyper
     3.2 Call variants with HaplotypeCaller
         3.2.1 View realigned reads and assembled haplotypes
         3.2.2 Run more samples
     3.3 Run HaplotypeCaller on a single bam file in GVCF mode
         3.3.1 View resulting GVCF file in the terminal
         3.3.2 View variants in IGV
         3.3.3 Run joint genotyping on the CEU Trio GVCFs to generate the final VCF
         3.3.4 View variants in IGV and compare callsets
+
+

1 WORKING WITH DATASETS FROM DIFFERENT EXPERIMENTAL DESIGNS

+

1.1 The genome reference: b37

+

We are using a version of the b37 human genome reference containing only a subset of chromosome 20, which we prepared specially for this tutorial in order to provide a reasonable bundle size for download. It is accompanied by its index and sequence dictionary.

ref/
- human_g1k_b37_20.fasta: genome reference
- human_g1k_b37_20.fasta.fai: fasta index
- human_g1k_b37_20.dict: sequence dictionary
+

Open up IGV, and load the Human (1kg, b37+decoy) reference available on the IGV server (Genomes>Load Genome from Server). We use this reference in IGV because it has a pre-loaded gene track, whereas our custom chromosome-20-only reference does not.

+

+

1.2 The test sample: NA12878 Whole-Genome Sequence (WGS)

+

The biological sample from which the example sequence data was obtained comes from individual NA12878, a member of a 17 sample collection known as CEPH Pedigree 1463, taken from a family in Utah, USA. A trio of two parents and one child from this data set is often referred to as the CEU Trio and is widely used as an evaluation standard (e.g. in the Illumina Platinum Genomes dataset). Note that an alternative trio constituted of the mother (NA12878) and her parents is often also referred to as a CEU Trio. Our trio corresponds to the 2nd generation and one of the 11 grandchildren.

+

We will begin with a bit of data exploration by looking at the following BAM files derived from NA12878:

+
  1. NA12878_wgs_20.bam

     Whole genome sequence (WGS) dataset, paired-end 151 bp reads sequenced on Illumina HiSeqX and fully pre-processed according to the GATK Best Practices for germline DNA.

  2. NA12878_rnaseq_20.bam

     RNAseq dataset, paired-end 75 bp reads sequenced on Illumina HiSeqX and aligned using STAR 2-pass according to the GATK Best Practices for RNAseq.

  3. NA12878_ICE_20.bam

     Exome dataset, Illumina Capture Exome (ICE) library, paired-end 76 bp reads sequenced on Illumina HiSeqX, fully pre-processed according to the GATK Best Practices for germline DNA.

  4. NA12878_NEX_20.bam

     Exome dataset, Illumina Nextera Rapid Capture Exome (NEX) library, paired-end 76 bp reads sequenced on Illumina HiSeqX, fully pre-processed according to the GATK Best Practices for germline DNA.

The sequence data files have been specially prepared as well to match our custom chromosome 20-only reference. To keep file sizes down, they only contain data on chromosome 20, in two pre-determined intervals of interest: 20:10,000,000-10,200,000 and 20:15,800,000-16,100,000.

+

Let’s start by loading the DNA WGS sample of NA12878 (bams/exp_design/NA12878_wgs_20.bam), as shown in the screenshots below.

+ +

Initially you will not see any data displayed. You need to zoom in to a smaller region for IGV to start displaying reads. You can do that by using the -/+ zoom controls, or by typing in some genome regions coordinates. Here, we’ll zoom into a predetermined interval of interest, so type 20:16,029,744-16,030,079 into the coordinates box. Once you hit the [Go] button, you should see something like this:

+ +

The top track shows depth of coverage, i.e. the amount of sequence reads present at each position. The mostly grey horizontal bars filling the viewport are the reads. Grey means that those bases match the reference, while colored stripes or base letters (depending on your zoom level) indicate mismatches. You will also see some reads with mapping insertions and deletions, indicated by purple I symbols and crossed-out gaps, respectively.

+

+
+

TOOL TIP + Read details are shown when you hover over them with your mouse--which can be convenient when troubleshooting, but gets annoying quickly. To turn it off, Click the yellow speech bubble in the toolbar and select “Show details on click”.

+
+

1.3 For comparison: NA12878 Exome Sequence

+

Next, let’s load our two Exome data sets (File>Load from File), NA12878_ICE_20.bam and NA12878_NEX_20.bam, and go to position 20:15,873,697-15,875,416.

+ +

You can see from the coverage graph that the ICE sample has more breadth and depth of coverage at this target site, in comparison to the NEX sample. This directly affects our ability to call variants in the leftmost peak, since ICE provides much more depth and NEX has a particularly lopsided distribution of coverage at that site. That’s not to say that ICE is better in general--just that for this target site, in this sequencing run, it provided more even coverage. The overarching point here is that exome kits are not all equivalent and you should evaluate which kit provides the results you need in the regions you care about, before committing to a particular kit for a whole project. As a corollary, comparing exome datasets generated with different kits can be complicated and requires careful evaluation.

+

1.4 Another comparison: NA12878 RNAseq

+

Lastly, let’s load (File>Load from File) the aligned RNAseq dataset that we have for NA12878 (NA12878_rnaseq_20.bam).

+ + +

You’ll notice pale blue lines to the right of center instead of reads. This is because it’s an intronic region! The blue lines connect to reads that are located in the exon. Click on one to see the N operator in the CIGAR string: in the example here, 32M91225N43M indicates that the read covers a 91225 bp intron.

+
+

2 DIAGNOSING UNKNOWN BAMS

+

2.1 View header and check read groups

+

Now let’s say that you have been brought on to a new project: you will be analyzing sequenced genomes for particular variants in chromosome 20--since you are the chromosome 20 specialist. Your coworker has given you some files that they sequenced a while back. Unfortunately, their lab notebook is mostly illegible and lacking in detail where you can read it. So how do you know what’s been done to these files already? Or even if they are still good to use?

+

Enter Samtools. You can use this tool to open up the bam file your coworker gave you, and check the bam’s record log. Open up your terminal and execute the following:

+
samtools view -H bams/exp_design/NA12878_wgs_20.bam | grep '@RG'
+

The bam records log information in the header, so we use view -H to ask it to just show us the header. Since we want to see what this sample is, we will also add | grep ‘@RG’, which will only grab the line of the header that starts with @RG.

+
+

@RG ID:H0164.2 PL:illumina PU:H0164ALXX140820.2 LB:Solexa-272222 PI:0 DT:2014-08-20T00:00:00-0400 SM:NA12878 CN:BI

+
+

You can use the read group information to confirm that this file is what your coworker’s notebook scribbles say it is. You can see that it is indeed the NA12878 sample (SM), and the read group ID H0164.2 (ID) matches, etc. After checking that these identifiers match what you can decipher from your coworker’s writing, call Samtools again. This time we will look at @PG to see what tools have been used on this bam file.

+
samtools view -H bams/exp_design/NA12878_wgs_20.bam | grep '@PG'
+

Again, this only grabs @PG lines from the header, but you will still get a rather long print out in the terminal; we show a single @PG entry below.

+
+

@PG ID:bwamem PN:bwamem VN:0.7.7-r441 CL:/seq/software/picard/1.750/3rd_party/bwa_mem/bwa mem -M -t 10 -p /ref/b37.fasta /dev/stdin > /dev/stdout

+
+

At the very beginning of each @PG entry, there will be a program ID. From this entry, you can see that BWA MEM was run on the bam file your coworker gave you--the rest of the entry describes the specific parameters that the tool was run with. Scanning through all the entries, you should see that your coworker ran GATK IndelRealigner, GATK PrintReads, MarkDuplicates, and BWA MEM. These tools correlate with the pre-processing steps that your coworker told you they took: mapping with BWA MEM, duplicate marking with MarkDuplicates, indel realignment with IndelRealigner, and lastly, BQSR with PrintReads.

+

How does BQSR correspond to PrintReads? Well, PrintReads is the tool used after running BQSR to apply the recalibration to the bam file itself. Since running BaseRecalibrator didn’t modify the bam file, it isn’t recorded in the bam header, but you can infer that it was run because PrintReads shows up in the header.

+

2.2 Validate the file

+

Now satisfied that the file your coworker gave you is properly pre-processed from looking at its header, you want to make sure that the body of the bam file wasn’t broken at some point. We will try diagnosing possible problems in the bam using ValidateSamFile.

+
java -jar picard.jar ValidateSamFile \
+    I=input.bam \
+    MODE=SUMMARY
+

Since we don’t know what kind of errors or warnings we will find, we first run the tool in SUMMARY mode. This will output a histogram listing all the errors and warnings in our file.

+
+

## HISTOGRAM java.lang.String
Error Type Count
ERROR:MATE_NOT_FOUND 77

+
+

That many errors? The file could be badly damaged, but let’s take a closer look. The error here is a MATE_NOT_FOUND, indicating that a read was marked as paired, but that its mate is not found in the file. Now, usually this would be a point of concern, but your coworker told you that this file was subset to a small part of chromosome 20, so it would make sense that some reads mapped within this region and their mates mapped outside the region.
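
If you wanted to see the individual records behind those counts, one option is to re-run ValidateSamFile in VERBOSE mode, as sketched below; the tool also accepts an IGNORE option (e.g. IGNORE=MATE_NOT_FOUND) if you later want to silence an error type you have decided is expected.

java -jar picard.jar ValidateSamFile \
    I=input.bam \
    MODE=VERBOSE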

+

We can safely ignore this warning. For more details on errors and warnings that ValidateSamFile can produce (since you won’t just be running your coworker’s samples forever), check out this article. For your coworker’s file, though, you are finally ready to move on to…

+
+

3 VARIANT DISCOVERY

+

3.1 Call variants with a position-based caller: UnifiedGenotyper

+

You found a (typed!) copy of your coworker's variant discovery protocol, so you want to run their bam file following it. It tells you to run the following command:

+
java -jar GenomeAnalysisTK.jar -T UnifiedGenotyper \
+    -R ref/human_g1k_b37_20.fasta \
+    -I bams/exp_design/NA12878_wgs_20.bam \
+    -o sandbox/NA12878_wgs_20_UG_calls.vcf \
+    -glm BOTH \
+    -L 20:10,000,000-10,200,000
+

Reading from the protocol, you see that -glm BOTH tells the tool to call both indels and SNPs, while -L gives the interval that the bam was subset to--no use wasting time trying to run on the whole genome when you only have data for a small amount.

+

When the results return, load the original bam file (bams/exp_design/NA12878_wgs_20.bam) and the output VCF (sandbox/NA12878_wgs_20_UG_calls.vcf) in IGV. Zooming to the coordinates 20:10,002,371-10,002,546, you will see something like the screenshot below.

+ + +

The variant track shows only variant calls--so at this particular site, there is a homozygous SNP call. (You can click on the variant call for more information on it, too.) The bam track below shows the supporting read data that led to a variant call at that site.

+

Since this laptop screen is so tiny (our budget went to reagents rather than monitors…) and we can’t zoom out any more vertically, right-click on the bam track and select “Collapsed” view.

+

This gives us a better overview of what the data looks like in this region: good even coverage, not too much noise in the region, and reasonable allele balance (mostly variant supports the homozygous variant call). Based on the information we see here, this should be a clear variant site.

+

3.2 Call variants with HaplotypeCaller

+

While preparing for this project, though, you recall hearing about another variant caller: HaplotypeCaller. And, looking on GATK’s website, you see that it recommends calling your variants using HaplotypeCaller over the old UnifiedGenotyper. The new algorithm calls both SNP and indel variants simultaneously via local de-novo assembly of haplotypes in an active region. Essentially, when this variant caller finds a region with signs of variation, it tosses out the old alignment information (from BWA MEM) and performs a local realignment of reads in that region. This makes HaplotypeCaller more accurate in regions that are traditionally difficult to call--such as areas that contain different types of variants close together. Position-based callers like UnifiedGenotyper simply can’t compete.

+

You decide to re-run your sample with the new variant caller to see if it makes a difference. Tool documentation on the website gives you a basic command to run, and you add your coworker’s interval trick (-L) in as well.

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
+    -R ref/human_g1k_b37_20.fasta \
+    -I bams/exp_design/NA12878_wgs_20.bam \
+    -o sandbox/NA12878_wgs_20_HC_calls.vcf \
+    -L 20:10,000,000-10,200,000
+

Load the output VCF (sandbox/NA12878_wgs_20_HC_calls.vcf) in IGV to compare the HC calls to the previously-loaded UG calls.

+ +

We see that HC called the same C/T SNP as UG, but it also called another variant, a homozygous variant insertion of three T bases. How is this possible when so few reads seem to support an insertion at this position?

+
+

TOOL TIP + When you encounter indel-related weirdness, turn on the display of soft-clips, which IGV turns off by default. Go to View > Preferences > Alignments and select “Show soft-clipped bases”

+
+

With soft clip display turned on, the region lights up with variants. This tells us that the aligner (here, BWA MEM) had a lot of trouble mapping reads in the region. It suggests that HaplotypeCaller may have found a different alignment after performing its local graph assembly step. This reassembled region provided HaplotypeCaller with enough support to call the indel that UnifiedGenotyper missed. +

+ +

3.2.1 View realigned reads and assembled haplotypes

+

But we’re not satisfied with “probably” here. Let’s take a peek under the hood of HaplotypeCaller. You find that HaplotypeCaller has a parameter called -bamout, which allows you to ask for the realigned version of the bam. That realigned version is what HaplotypeCaller uses to make its variant calls, so you will be able to see if a realignment fixed the messy region in the original bam.

+

You decide to run the following command:

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
+    -R ref/human_g1k_b37_20.fasta \
+    -I bams/exp_design/NA12878_wgs_20.bam \
+    -o sandbox/NA12878_wgs_20_HC_calls_debug.vcf \
+    -bamout sandbox/NA12878_wgs_20.HC_out.bam \
+    -forceActive -disableOptimizations \
+    -L 20:10,002,371-10,002,546 -ip 100
+

Since you are only interested in looking at that messy region, you decide to give the tool a narrowed interval with -L 20:10,002,371-10,002,546, with a 100 bp padding on either side using -ip 100. To make sure the tool does perform the reassembly in that region, you add in the -forceActive and -disableOptimizations arguments.

+

Load the output BAM (sandbox/NA12878_wgs_20.HC_out.bam) in IGV, and switch to Collapsed view once again. You should still be zoomed in on coordinates 20:10,002,371-10,002,546, and have the original bam track loaded for comparison.

+ +

After realignment by HaplotypeCaller (the bottom track), almost all the reads show the insertion, and the messy soft clips from the original bam are gone. Expand the reads in the output BAM (right click>Expanded view), and you can see that all the insertions are in phase with the C/T SNP.

+ +

There is more to a BAM than meets the eye--or at least, what you can see in this view of IGV. Right-click on the reads to bring up the view options menu. Select Color alignments by, and choose read group. Your gray reads should now be colored similar to the screenshot below.

+ +

Some of the first reads, shown in red at the top of the pile, are not real reads. These represent artificial haplotypes that were constructed by HaplotypeCaller, and are tagged with a special read group identifier, “ArtificialHaplotype,” so they can be visualized in IGV. You can click on an artificial read to see this tag under RG.

+

We see that HaplotypeCaller considered six possible haplotypes, because there is more than one variant in the same ActiveRegion. Zoom out further, and we can see that two ActiveRegions were examined within the scope of the interval we provided (with padding).

+
+

3.2.2 Run more samples

+

You’ve decided that perhaps HaplotypeCaller will work better for your project. However, since you have been working on this protocol update, your coworker found two more samples--they were in a different folder on their computer for reasons you can’t figure out. Regardless, you now need to joint call all the samples together. So, using the same command as before, you’ve tacked on the two additional bam files.

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
+    -R ref/human_g1k_b37_20.fasta \
+    -I bams/exp_design/NA12878_wgs_20.bam \
+    -I bams/trio-calling/NA12877_wgs_20.bam \
+    -I bams/trio-calling/NA12882_wgs_20.bam \
+    -o sandbox/NA12878_wgs_20_HC_jointcalls.vcf \
+    -L 20:10,000,000-10,200,000
+

You notice, after entering that command, that HaplotypeCaller takes much longer to return than the other tasks we have run so far. You decide to check the results of this command later, and do some digging on how to make things go faster.

+

3.3 Run HaplotypeCaller on a single bam file in GVCF mode

+

Every time your coworker finds a new folder of samples, you’ll have to re-run all the samples using this increasingly slower HaplotypeCaller command. You’ve also been approved for a grant and intend to send your own samples out for sequencing, so there are those to add in as well. You could just wait until you have all the samples gathered, but that could be a while and your PI wants to see some preliminary results soon. You read about a new GATK workflow that lets you make everyone happy: the GVCF workflow.

+

The first step in variant discovery is to run HaplotypeCaller in GVCF mode on each individual bam file. This is basically running HaplotypeCaller as you did before, but with -ERC GVCF added to the command. You first want to run HaplotypeCaller in GVCF mode on the NA12878 bam. (In the interest of time, we have supplied the other sample GVCFs in the bundle, but normally you would run them individually in the same way as the first.) This will produce a GVCF file that contains genotype likelihoods for each variant position as well as blocks for each interval where no variant is likely. You’ll see what this looks like in a minute.

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
+    -R ref/human_g1k_b37_20.fasta \
+    -I bams/exp_design/NA12878_wgs_20.bam \
+    -o sandbox/NA12878_wgs_20.g.vcf \
+    -ERC GVCF \
+    -L 20:10,000,000-10,200,000
+

3.3.1 View resulting GVCF file in the terminal

+

Since a GVCF is a new file type for your workflow, let’s take a look at the actual content first. You can do this in the terminal by typing this command:

+

more sandbox/NA12878_wgs_20.g.vcf

+

As you scroll through the file (hit [ENTER] to scroll, [CTRL]+[C] to exit), note the NON_REF allele defined in the header.

+
+

##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">

+
+

Also note the GVCF blocks defined later in the header. The reference (non-variant) blocks are recorded in the GVCF file, in blocks separated by genotype quality.

+
+

##GVCFBlock0-1=minGQ=0(inclusive),maxGQ=1(exclusive)
##GVCFBlock1-2=minGQ=1(inclusive),maxGQ=2(exclusive)
##GVCFBlock10-11=minGQ=10(inclusive),maxGQ=11(exclusive)
##GVCFBlock11-12=minGQ=11(inclusive),maxGQ=12(exclusive)

+
+

Finally, while scrolling through the records, we can see the reference blocks and variant sites.

+
+

20 10000115 . G <NON_REF> . . END=10000116 GT:DP:GQ:MIN_DP:PL 0/0:25:69:25:0,69,1035
20 10000117 . C T,<NON_REF> 262.77 . BaseQRankSum=-0.831;ClippingRankSum=-0.092;DP=23;MLEAC=1,0;MLEAF=0.500,0.00;MQ=60.47;MQRankSum=1.446;ReadPosRankSum=0.462 GT:AD:DP:GQ:PL:SB 0/1:11,12,0:23:99:291,0,292,324,327,652:9,2,9,3
20 10000118 . T <NON_REF> . . END=10000123 GT:DP:GQ:MIN_DP:PL 0/0:25:63:24:0,63,945

+
+

Every site in the interval we analyzed is represented here--whether it be by a variant call, a reference call, or a reference block. This helps to distinguish between a “no call” (we don’t have enough data to make a call) and a “reference call” (we have evidence that the site matches the reference).
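
If you are curious how much of the interval ended up in reference blocks, one rough check (assuming a UNIX shell) is to count the records whose INFO field carries an END= key, which is how the reference blocks above are encoded:

grep -v '^#' sandbox/NA12878_wgs_20.g.vcf | grep -c 'END='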

+

3.3.2 View variants in IGV

+

Now, text in a terminal window can be rather hard to read, so let’s take a look at the GVCFs in IGV. Start a new session to clear your IGV screen, then load the three GVCFs (sandbox/NA12878_wgs_20.g.vcf, gvcfs/NA12877_wgs_20.g.vcf, gvcfs/NA12882_wgs_20.g.vcf). You should already be zoomed in on 20:10,002,371-10,002,546 from our previous section, and see this:

+ +

Notice anything different from the VCF? Along with the colorful variant sites, you see many gray blocks in the GVCF representing the non-variant intervals. Most of the gray blocks are next to each other, but are not grouped together, because they belong to different GQ blocks. The chief difference between the GVCF here and the next step’s VCF is the lack of reference blocks (the gray bits). Only very low-confidence variant sites will be removed in the VCF, based on the QUAL score.

+

3.3.3 Run joint genotyping on the CEU Trio GVCFs to generate the final VCF

+

The last step is to joint call all your GVCF files using the GATK tool GenotypeGVCFs. After looking in the tool documentation, you run this command:

+
java -jar GenomeAnalysisTK.jar -T GenotypeGVCFs \
+    -R ref/human_g1k_b37_20.fasta \
+    -V sandbox/NA12878_wgs_20.g.vcf \
+    -V gvcfs/NA12877_wgs_20.g.vcf \
+    -V gvcfs/NA12882_wgs_20.g.vcf \
+    -o sandbox/CEUTrio_wgs_20_GGVCFs_jointcalls.vcf \
+    -L 20:10,000,000-10,200,000
+

That returned much faster than the HaplotypeCaller step--and a good thing, too, since this step is the one you’ll need to re-run every time your coworker finds a “new” sample buried in their messy file structure. But does calling this way really give you good results? Let’s take a look.

+

3.3.4 View variants in IGV and compare callsets

+

Load the joint called VCF from normal HaplotypeCaller, section 3.2.1 (sandbox/NA12878_wgs_20_HC_jointcalls.vcf), and GenotypeGVCFs, section 3.3.3 (sandbox/CEUTrio_wgs_20_GGVCFs_jointcalls.vcf). Change your view to look at 20:10,002,584-10,002,665, and you will see:

+ +

At this site, the father NA12877 is heterozygous for a G/T SNP, and the mother, NA12878, and son, NA12882, are homozygous variant for the same SNP. These calls match up, and you figure that the calls between GenotypeGVCFs and HaplotypeCaller, when run in multisample mode, are essentially equivalent. (And if you did some digging, you would find some marginal differences in borderline calls.) However, the GVCF workflow allows you to be more flexible. Every time your PI wants an update on the project, you can simply re-run the quick GenotypeGVCFs step on all the samples you have gathered so far. The expensive and time-consuming part of calculating genotype likelihoods only needs to be done once on each sample, so you won’t have to spend all your grant money on compute to rerun the whole cohort every time you have a new sample.

+

You have successfully run your coworker’s samples, and you’ve found that the most effective workflow for you is the most recent GVCF workflow. Your next step takes you to filtering the callset with either VQSR or hard filters--but you decide to take a break before tackling the next part of the workflow.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Evaluate_a_callset_with_CollectVariantCallingMetrics.md b/doc_archive/tutorials/(howto)_Evaluate_a_callset_with_CollectVariantCallingMetrics.md new file mode 100644 index 000000000..58b375fc3 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Evaluate_a_callset_with_CollectVariantCallingMetrics.md @@ -0,0 +1,47 @@ +## (howto) Evaluate a callset with CollectVariantCallingMetrics + +http://gatkforums.broadinstitute.org/gatk/discussion/6186/howto-evaluate-a-callset-with-collectvariantcallingmetrics + +

Related Documents

+ +

Context

+

This document will walk you through use of Picard's CollectVariantCallingMetrics tool, an excellent tool for large callsets, especially if you need your results quickly and do not require many additional metrics to those described here. Your callset consists of variants identified by earlier steps in the GATK best practices pipeline, and now requires additional evaluation to determine where your callset falls on the spectrum of "perfectly identifies all true, biological variants" to "only identifies artifactual or otherwise unreal variants". When variant calling, we want the callset to maximize the correct calls, while minimizing false positive calls. While very robust methods, such as Sanger sequencing, can be used to individually sequence each potential variant, statistical analysis can be used to evaluate callsets instead, saving both time and money. These callset-based analyses are accomplished by comparing relevant metrics between your samples and a known truth set, such as dbSNP. Two tools exist to examine these metrics: VariantEval in GATK, and CollectVariantCallingMetrics in Picard. While the latter is currently used in the Broad Institute's production pipeline, the merits to each tool, as well as the basis for variant evaluation, are discussed here.

+
+

Example Use

+

Command

+
java -jar picard.jar CollectVariantCallingMetrics \
+INPUT=CEUtrio.vcf \
+OUTPUT=CEUtrioMetrics \
+DBSNP=dbsnp_138.b37.excluding_sites_after_129.vcf 
+ +

Getting Results

+

After running the command, CollectVariantCallingMetrics will return both a detail and a summary metrics file. These files can be viewed as a text file if needed, or they can be read in as a table using your preferred spreadsheet viewer (e.g. Excel) or scripting language of your choice (e.g. python, R, etc.) The files contain headers and are tab-delimited; the commands for reading in the tables in RStudio are found below. (Note: Replace "~/path/to/" with the path to your output files as needed.)

+
summary <- read.table("~/path/to/CEUtrioMetrics.variant_calling_summary_metrics", header=TRUE, sep="\t")
+detail <- read.table("~/path/to/CEUtrioMetrics.variant_calling_detail_metrics", header=TRUE, sep="\t")
+ +

Analyzing Results

+ +

*Concatenated in the above table are the detail file's (rows 1-3) and the summary file's (row 4) relevant metrics; for full output table, see attached image file.

+ \ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Evaluate_a_callset_with_VariantEval.md b/doc_archive/tutorials/(howto)_Evaluate_a_callset_with_VariantEval.md new file mode 100644 index 000000000..63758ebf8 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Evaluate_a_callset_with_VariantEval.md @@ -0,0 +1,66 @@ +## (howto) Evaluate a callset with VariantEval + +http://gatkforums.broadinstitute.org/gatk/discussion/6211/howto-evaluate-a-callset-with-varianteval + +

Related Documents

+ +

Context

+

This document will walk you through use of GATK's VariantEval tool. VariantEval is highly customizable, enabling an enhanced analysis of your callset through stratification, use of additional evaluation modules, and the ability to pass in multiple truth sets. Your callset consists of variants identified by earlier steps in the GATK best practices pipeline, and now requires additional evaluation to determine where it falls on the spectrum of "perfectly identifies all true, biological variants" to "only identifies artifactual or otherwise unreal variants". When variant calling, we want the callset to maximize correct calls while minimizing false positive calls. While very robust methods, such as Sanger sequencing, can be used to individually validate each potential variant, statistical analysis can be used to evaluate callsets instead, saving both time and money. These callset-based analyses are accomplished by comparing relevant metrics between your samples and a known truth set, such as dbSNP. Two tools exist to examine these metrics: VariantEval in GATK, and CollectVariantCallingMetrics in Picard. While the latter is currently used in the Broad Institute's production pipeline, the merits of each tool, as well as the basis for variant evaluation, are discussed here.

+
+

Example Analysis

+
java -jar GenomeAnalysisTK.jar \
+-T VariantEval \
+-R reference.b37.fasta \
+-eval SampleVariants.vcf \
+-D dbsnp_138.b37.excluding_sites_after_129.vcf \
+-noEV -EV CompOverlap -EV IndelSummary -EV TiTvVariantEvaluator -EV CountVariants -EV MultiallelicSummary \
+-o SampleVariants_Evaluation.eval.grp
+

This command specifies the tool (VariantEval), input files, evaluation modules to be used, and an output file to write the results to. The output will be in the form of a GATKReport.

+
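If you would rather work with the results programmatically, the gsalib R library (installed as part of the Best Practices software setup) can parse GATKReport files. A minimal sketch, assuming gsalib is installed and the report is in your working directory:
library(gsalib)
+report <- gsa.read.gatkreport("SampleVariants_Evaluation.eval.grp")  # returns a list of tables, one per module
+names(report)        # e.g. "CompOverlap", "IndelSummary", ...
+report$CompOverlap   # inspect one module's table as a data frame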

Input Files

+ +

Evaluation Modules

+

For our example command, we will simplify our analysis and examine results using the following minimum standard modules: CompOverlap, IndelSummary, TiTvVariantEvaluator, CountVariants, and MultiallelicSummary. These modules provide a reasonable assessment of variant qualities while reducing the computational burden compared to running the default modules. For the data we ran here (>1500 whole-genome-sequenced samples), this reduced the run time by 5 hours and 20 minutes relative to the default modules, a 12% time reduction. In order to do this, all default modules are removed with -noEV, then the minimum standard modules are added back in. This tool only uses variants that have passed all filtration steps to calculate metrics.

+ +

Example Output

+ +

Here we see an example of the table generated by the CompOverlap evaluation module. The field concordantRate is highlighted as it is one of the metrics we examine for quality checks. Each table generated by the sample call is in the attached files list at the end of this document, which you are free to browse at your leisure.

+

It is important to note the stratification by novelty, seen in this and all other tables for this example. The row for "novel" includes all variants that are found in SampleVariants.vcf but not found in the known variants file. By default, your known variants are in dbSNP. However, if you would like to specify a different known set of variants, you can pass in a -comp file, and call -knownName on it. (See the VariantEval tool documentation for more information) The "known" row includes all variants found in SampleVariants.vcf and the known variants file. "All" totals the "known" and "novel" rows. This novelty stratification is done by default, but many other stratifications can be specified; see tool documentation for more information.

+
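As a sketch of that usage (the resource name and file below are hypothetical; check the VariantEval tool documentation for the exact -comp binding syntax):
java -jar GenomeAnalysisTK.jar \
+-T VariantEval \
+-R reference.b37.fasta \
+-eval SampleVariants.vcf \
+-comp:my_truth my_truth_set.vcf \
+-knownName my_truth \
+-o SampleVariants_Evaluation.eval.grp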

Compiled in the below table are all of the metrics taken from various tables that we will use to ascertain the quality of the analysis.

+

Metrics Analysis

+ + +
+

Note on speed performance

+

The purpose of running the analysis with the minimum standard evaluation modules is to minimize the time spent running VariantEval. Reducing the number of evaluation modules has some effects on the total runtime; depending on the additional specifications given (stratifications, multiple -comp files, etc.), running with the minimum standard evaluation modules can reduce the runtime by 10-30% in comparison to running the default evaluation modules. Further reducing the runtime can be achieved through multithreading, using the -nt argument.
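For example, the command shown earlier can be multithreaded by simply appending the -nt argument (adjust the thread count to your machine):
java -jar GenomeAnalysisTK.jar -T VariantEval -R reference.b37.fasta -eval SampleVariants.vcf \
+-D dbsnp_138.b37.excluding_sites_after_129.vcf \
+-noEV -EV CompOverlap -EV IndelSummary -EV TiTvVariantEvaluator -EV CountVariants -EV MultiallelicSummary \
+-nt 4 -o SampleVariants_Evaluation.eval.grp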

\ No newline at end of file diff --git "a/doc_archive/tutorials/(howto)_Generate_a_\"bamout_file\"_showing_how_HaplotypeCaller_has_remapped_sequence_reads.md" "b/doc_archive/tutorials/(howto)_Generate_a_\"bamout_file\"_showing_how_HaplotypeCaller_has_remapped_sequence_reads.md" new file mode 100644 index 000000000..e285ce778 --- /dev/null +++ "b/doc_archive/tutorials/(howto)_Generate_a_\"bamout_file\"_showing_how_HaplotypeCaller_has_remapped_sequence_reads.md" @@ -0,0 +1,28 @@ +## (howto) Generate a "bamout file" showing how HaplotypeCaller has remapped sequence reads + +http://gatkforums.broadinstitute.org/gatk/discussion/5484/howto-generate-a-bamout-file-showing-how-haplotypecaller-has-remapped-sequence-reads + +

1. Overview

+

As you may know, HaplotypeCaller performs a local reassembly and realignment of the reads in the region surrounding potential variant sites (see the HaplotypeCaller method docs for more details on why and how this is done). So it often happens that during the calling process, the reads get moved to different mapping positions than what you can observe in the BAM file that you originally provided to HC as input.

+

These remappings usually explain most discordances between calls that are expected based on the original data and actual calls made by HaplotypeCaller, so it's very useful to be able to visualize what rearrangements the tool has made.

+

Please note: The bamout file cannot be generated when using -nt or -nct.

+

2. Generating the bamout for a single site or interval

+

To generate the bamout file for a specific site or interval, just run HaplotypeCaller on the region around the site or interval of interest using the -L argument to restrict the analysis to that region (adding about 500 bp on either side) and using the -bamout argument to specify the name of the bamout file that will be generated.

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R human_b37_20.fasta -I recalibrated.bam -o hc_variants.vcf -L 20:10255630-10255840 -bamout bamout.bam
+

If you were using any additional parameters in your original variant calling (including -ERC and related arguments), make sure to include them in this command as well so that you can make an apples-to-apples comparison.

+

Then you open up both the original bam and the bamout file together in a genome browser such as IGV. On some test data from our favorite sample, NA12878, this is what you would see:

+

+

You can see that the bamout file, on top, contains data only for the ActiveRegion that was within the analysis interval specified by -L. The two blue reads represent the artificial haplotypes constructed by HaplotypeCaller (you may need to adjust your IGV settings to see the same thing on your machine).

+

You can see a whole group of reads neatly aligned, with an insertion in the middle. In comparison, the original data shown in the lower track has fewer reads with insertions, but has several reads with mismapped ends. This is a classic example of a site where realignment through reassembly has provided additional evidence for an indel, allowing HaplotypeCaller to call it confidently. In contrast, UnifiedGenotyper was not able to call this insertion confidently.

+

3. Generating the bamout for multiple intervals or the whole genome

+

Although we don't recommend doing this by default because it will cause slower performance and take up a lot of storage space, you can generate a bamout that contains many more intervals, or even covers the whole genome. To do so, just run the same command, but this time, pass your list of intervals to -L, or simply omit it if you want the entire genome to be included.

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R human_b37_20.fasta -I recalibrated.bam -o hc_variants.vcf -bamout bamout.bam
+
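For example, to restrict the bamout to a set of intervals listed in a file (the file name is a placeholder; -L also accepts .intervals or .list files):
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R human_b37_20.fasta -I recalibrated.bam -o hc_variants.vcf -L my_intervals.list -bamout bamout.bam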

This time, if you zoom out a bit in IGV, you will see multiple stacks of reads corresponding to the various ActiveRegions that were identified and processed.

+

+

4. Forcing an output in a region that is not covered in the bamout

+

In some cases HaplotypeCaller does not complete processing on an ActiveRegion that it has started. This is typically because there is either almost no evidence of variation once the remapping has been done, or on the contrary, the region is very messy and there is too much complexity. In both cases, the program is designed to give up in order to avoid wasting time. This is a good thing most of the time, but it does mean that sometimes you will have no output in the bamout for the site you are trying to troubleshoot.

+

The good news is that in most cases it is possible to force HaplotypeCaller to go through with the full processing so that it will produce bamout output for your site of interest. To do so, simply add the flags -forceActive and -disableOptimizations to your command line, in addition to the -bamout argument of course.

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R human_b37_20.fasta -I recalibrated.bam -L 20:10371667-10375021 -o hc_forced.vcf -bamout force_bamout.bam -forceActive -disableOptimizations 
+

In this other region, you can see that the original mapping (middle track) was a bit messy with some possible evidence of variation, and in fact UnifiedGenotyper called a SNP in this region (top variant track). But HaplotypeCaller did not call the SNP, and did not output anything in our first bamout file (top read track). When you force an output in that region using the two new flags, you see in the forced bamout (bottom read track) that the remapped data is a lot cleaner and the evidence for variation is essentially gone.

+

+

It is also possible to force an ActiveRegion to be triggered at specific intervals; see the HaplotypeCaller tool docs for more details on how this is done.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Install_all_software_packages_required_to_follow_the_GATK_Best_Practices..md b/doc_archive/tutorials/(howto)_Install_all_software_packages_required_to_follow_the_GATK_Best_Practices..md new file mode 100644 index 000000000..ed5ce6c80 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Install_all_software_packages_required_to_follow_the_GATK_Best_Practices..md @@ -0,0 +1,136 @@ +## (howto) Install all software packages required to follow the GATK Best Practices. + +http://gatkforums.broadinstitute.org/gatk/discussion/2899/howto-install-all-software-packages-required-to-follow-the-gatk-best-practices + +

Objective

+

Install all software packages required to follow the GATK Best Practices.

+

Prerequisites

+

To follow these instructions, you will need to have a basic understanding of the meaning of the following words and command-line operations. If you are unfamiliar with any of the following, you should consult a more experienced colleague or your systems administrator if you have one. There are also many good online tutorials you can use to learn the necessary notions.

+ +

You will also need to have access to an ANSI compliant C++ compiler and the tools needed for normal compilations (make, shell, the standard library, tar, gunzip). These tools are usually pre-installed on Linux/Unix systems. On MacOS X, you may need to install the MacOS Xcode tools. See https://developer.apple.com/xcode/ for relevant information and software downloads. The XCode tools are free but an AppleID may be required to download them.

+

Starting with version 3.6, the GATK requires Java Runtime Environment version 1.8 (Java 8). Previous versions down to 2.6 required JRE 1.7, and earlier versions required 1.6. All Linux/Unix and MacOS X systems should have a JRE pre-installed, but the version may vary. To test your Java version, run the following command in the shell:

+
java -version 
+

This should return a message along the lines of "java version 1.8.0_25" as well as some details on the Runtime Environment (JRE) and Virtual Machine (VM). If you have a version that does not match the requirements stated above for the version of GATK you are running, the GATK may not run correctly or at all. The simplest solution is to install an additional JRE and specify which you want to use at the command-line. To find out how to do so, you should seek help from your systems administrator.

+
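As a quick illustration (the install path below is hypothetical and will differ on your system), you can bypass the default java by calling the desired JRE's binary by its full path:
/usr/local/jdk1.8.0_25/bin/java -version
+/usr/local/jdk1.8.0_25/bin/java -jar GenomeAnalysisTK.jar -h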

Software packages

+
    +
  1. BWA
  2. +
  3. SAMtools
  4. +
  5. Picard
  6. +
  7. Genome Analysis Toolkit (GATK)
  8. +
  9. IGV
  10. +
  11. RStudio IDE and R libraries ggplot2 and gsalib
  12. +
+

Note that the version numbers of packages you download may be different than shown in the instructions below. If so, please adapt the number accordingly in the commands.

+
+

1. BWA

+

Read the overview of the BWA software on the BWA project homepage, then download the latest version of the software package.

+ +

Unpack the tar file using:

+
tar xvjf bwa-0.7.12.tar.bz2 
+

This will produce a directory called bwa-0.7.12 containing the files necessary to compile the BWA binary. Move to this directory and compile using:

+
cd bwa-0.7.12
+make
+

The compiled binary is called bwa. You should find it within the same folder (bwa-0.7.12 in this example). You may also find other compiled binaries; at time of writing, a second binary called bwamem-lite is also included. You can disregard this file for now. Finally, just add the BWA binary to your path to make it available on the command line. This completes the installation process.

+ +
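One common way to do this under bash (a sketch; adjust the path to wherever you unpacked BWA, and use your own shell's profile file if you are not using bash):
echo 'export PATH="$PATH:$HOME/bwa-0.7.12"' >> ~/.bashrc
+source ~/.bashrc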

Open a shell and run:

+
bwa 
+

This should print out some version and author information as well as a list of commands. As the Usage line states, to use BWA you will always build your command lines like this:

+
bwa <command> [options] 
+

This means you first make the call to the binary (bwa), then you specify which command (method) you wish to use (e.g. index) then any options (i.e. arguments such as input files or parameters) used by the program to perform that command.

+
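For example, indexing a reference and aligning a pair of FASTQ files would look like this (file names are placeholders):
bwa index reference.fasta
+bwa mem reference.fasta reads_1.fastq reads_2.fastq > aligned_reads.sam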
+

2. SAMtools

+

Read the overview of the SAMtools software on the SAMtools project homepage, then download the latest version of the software package.

+ +

Unpack the tar file using:

+
tar xvjf samtools-0.1.2.tar.bz2 
+

This will produce a directory called samtools-0.1.2 containing the files necessary to compile the SAMtools binary. Move to this directory and compile using:

+
cd samtools-0.1.2 
+make 
+

The compiled binary is called samtools. You should find it within the same folder (samtools-0.1.2 in this example). Finally, add the SAMtools binary to your path to make it available on the command line. This completes the installation process.

+ +

Open a shell and run:

+
samtools 
+

This should print out some version information as well as a list of commands. As the Usage line states, to use SAMtools you will always build your command lines like this:

+
samtools <command> [options] 
+

This means you first make the call to the binary (samtools), then you specify which command (method) you wish to use (e.g. index) then any options (i.e. arguments such as input files or parameters) used by the program to perform that command. This is a similar convention as used by BWA.

+
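For example, two commands you are likely to use often (the file name is a placeholder):
samtools view -H aligned_reads.bam    # print the BAM header
+samtools index aligned_reads.bam      # create a .bai index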
+

3. Picard

+

Read the overview of the Picard software on the Picard project homepage, then download the latest version (currently 2.4.1) of the package containing the pre-compiled program file (the picard-tools-2.x.y.zip file).

+ +

Unpack the zip file using:

+
unzip picard-tools-2.4.1.zip 
+

This will produce a directory called picard-tools-2.4.1 containing the Picard jar files. Picard tools are distributed as a pre-compiled Java executable (jar file) so there is no need to compile them.

+

Note that it is not possible to add jar files to your path to make the tools available on the command line; you have to specify the full path to the jar file in your java command, which would look like this:

+
java -jar ~/my_tools/jars/picard.jar <Toolname> [options]
+

This syntax will be explained in a little more detail further below.

+

However, you can set up a shortcut called an "environment variable" in your shell profile configuration to make this easier. The idea is that you create a variable that tells your system where to find a given jar, like this:

+
PICARD="$HOME/my_tools/jars/picard.jar"
+

So then when you want to run a Picard tool, you just need to call the jar by its shortcut, like this:

+
java -jar $PICARD <Toolname> [options]
+

The exact way to set this up depends on what shell you're using and how your environment is configured. We like this overview and tutorial which explains how it all works; but if you are new to the command line environment and you find this too much to deal with, we recommend asking for help from your institution's IT support group.

+
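As one example for bash users (a sketch; adjust the path to wherever your picard.jar actually lives, and use your own shell's profile file if you are not using bash):
echo 'export PICARD="$HOME/my_tools/jars/picard.jar"' >> ~/.bash_profile
+source ~/.bash_profile
+java -jar $PICARD -h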

This completes the installation process.

+ +

Open a shell and run:

+
java -jar picard.jar -h 
+

This should print out some version and usage information for the Picard tools. At this point you will have noticed an important difference between BWA and Picard tools. To use BWA, we called on the BWA program and specified which of its internal tools we wanted to apply. To use Picard, we call on Java itself as the main program, then specify the picard.jar file followed by the name of the tool we want to run. This applies to all Picard tools; to use them you will always build your command lines like this:

+
java -jar picard.jar <ToolName> [options] 
+

This means you first make the call to Java itself as the main program, then specify the picard.jar file, then specify which tool you want, and finally you pass whatever other arguments (input files, parameters etc.) are needed for the analysis.

+

Note that the command-line syntax of Picard tools has recently changed from java -jar <ToolName>.jar to java -jar picard.jar <ToolName>. We are using the newer syntax in this document, but some of our other documents may not have been updated yet. If you encounter any documents using the old syntax, let us know and we'll update them accordingly. If you are already using an older version of Picard, either adapt the commands or better, upgrade your version!

+

Next we will see that GATK tools are called in essentially the same way, although the way the options are specified is a little different. The reasons for how tools in a given software package are organized and invoked are largely due to the preferences of the software developers. They generally do not reflect strict technical requirements, although they can have an effect on speed and efficiency.

+
+

4. Genome Analysis Toolkit (GATK)

+

Hopefully if you're reading this, you're already acquainted with the purpose of the GATK, so go ahead and download the latest version of the software package.

+

In order to access the downloads, you need to register for a free account on the GATK support forum. You will also need to read and accept the license agreement before downloading the GATK software package. Note that if you intend to use the GATK for commercial purposes, you will need to purchase a license. See the licensing page for an overview of the commercial licensing conditions.

+ +

Unpack the tar file using:

+
tar xjf GenomeAnalysisTK-3.3-0.tar.bz2 
+

This will produce a directory called GenomeAnalysisTK-3.3-0 containing the GATK jar file, which is called GenomeAnalysisTK.jar, as well as a directory of example files called resources. GATK tools are distributed as a single pre-compiled Java executable so there is no need to compile them. Just like we discussed for Picard, it's not possible to add the GATK to your path, but you can set up a shortcut to the jar file using environment variables as described above.

+
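For instance, following the same environment-variable pattern as for Picard (the path is hypothetical):
export GATK="$HOME/my_tools/jars/GenomeAnalysisTK.jar"
+java -jar $GATK -h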

This completes the installation process.

+ +

Open a shell and run:

+
java -jar GenomeAnalysisTK.jar -h 
+

This should print out some version and usage information, as well as a list of the tools included in the GATK. As the Usage line states, to use GATK you will always build your command lines like this:

+
java -jar GenomeAnalysisTK.jar -T <ToolName> [arguments] 
+

This means that just like for Picard, you first make the call to Java itself as the main program, then specify the GenomeAnalysisTK.jar file, then specify which tool you want, and finally you pass whatever other arguments (input files, parameters etc.) are needed for the analysis.

+
+

5. IGV

+

The Integrative Genomics Viewer is a genome browser that allows you to view BAM, VCF and other genomic file information in context. It has a graphical user interface that is very easy to use, and can be downloaded for free (though registration is required) from this website. We encourage you to read through IGV's very helpful user guide, which includes many detailed tutorials that will help you use the program most effectively.

+
+

6. RStudio IDE and R libraries ggplot2 and gsalib

+

Download the latest version of RStudio IDE. The webpage should automatically detect what platform you are running on and recommend the version most suitable for your system.

+ +

Follow the installation instructions provided. Binaries are provided for all major platforms; typically they just need to be placed in your Applications (or Programs) directory. Open RStudio and type the following command in the console window:

+
install.packages("ggplot2") 
+

This will download and install the ggplot2 library as well as any other library packages that ggplot2 depends on for its operation. Note that some users have reported having to install two additional packages themselves, called reshape and gplots, which you can do as follows:

+
install.packages("reshape")
+install.packages("gplots")
+

Finally, do the same thing to install the gsalib library:

+
install.packages("gsalib")
+

This will download and install the gsalib library.

+

Important note

+

If you are using a recent version of ggplot2 and a version of GATK older than 3.2, you may encounter an error when trying to generate the BQSR or VQSR recalibration plots. This is because until recently our scripts were still using an older version of certain ggplot2 functions. This has been fixed in GATK 3.2, so you should either upgrade your version of GATK (recommended) or downgrade your version of ggplot2. If you experience further issues generating the BQSR recalibration plots, please see this tutorial.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Install_software_for_GATK_workshops.md b/doc_archive/tutorials/(howto)_Install_software_for_GATK_workshops.md new file mode 100644 index 000000000..3c3be18c3 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Install_software_for_GATK_workshops.md @@ -0,0 +1,130 @@ +## (howto) Install software for GATK workshops + +http://gatkforums.broadinstitute.org/gatk/discussion/7098/howto-install-software-for-gatk-workshops + +

Objective

+

Install all software packages required to attend a GATK workshop.

+

Prerequisites

+

To follow these instructions, you will need to have a basic understanding of the meaning of the following words and command-line operations. If you are unfamiliar with any of the following, you should consult a more experienced colleague or your system administrator if you have one. There are also many good online tutorials you can use to learn the necessary notions.

+ +

Platform requirements

+

GATK is supported on all flavors of reasonably recent Linux/Unix and MacOS X systems, but NOT on Windows. The analyses we run in workshops are designed to run quickly and on small datasets, so should not require more than 2G of RAM. For file storage, plan on 10G of space (but I would be shocked if we get to half of that).

+

The current version of GATK requires Java Runtime Environment version 1.8. All Linux/Unix and MacOS X systems should have a JRE pre-installed, but the version may vary. To test your Java version, run the following command in the shell:

+
java -version 
+

This should return a message along the lines of "java version 1.8.0_65" as well as some details on the Runtime Environment (JRE) and Virtual Machine (VM). If you have a version other than 1.8.x, be aware that you may run into trouble with some of the more advanced features of the Picard and GATK tools. The simplest solution is to install an additional JRE and specify which you want to use at the command-line. To find out how to do so, you should seek help from your system administrator and read this article.

+

Software packages

+
    +
  1. Picard
  2. +
  3. Genome Analysis Toolkit (GATK)
  4. +
  5. IGV
  6. +
  7. RStudio IDE and R libraries ggplot2 and gsalib
  8. +
  9. Samtools
  10. +
  11. RTG Tools
  12. +
+
+

1. Picard

+

Read the overview of the Picard software on the Picard project homepage, then download the latest version (currently 2.4.1) of the package containing the pre-compiled program file (the picard-tools-2.x.y.zip file).

+ +

Unpack the zip file using:

+
unzip picard-tools-2.4.1.zip 
+

This will produce a directory called picard-tools-2.4.1 containing the Picard jar files. Picard tools are distributed as a pre-compiled Java executable (jar file) so there is no need to compile them.

+

Note that it is not possible to add jar files to your path to make the tools available on the command line; you have to specify the full path to the jar file in your java command, which would look like this:

+
java -jar ~/my_tools/jars/picard.jar <Toolname> [options]
+

This syntax will be explained in a little more detail further below.

+

However, you can set up a shortcut called an "environment variable" in your shell profile configuration to make this easier. The idea is that you create a variable that tells your system where to find a given jar, like this:

+
PICARD="$HOME/my_tools/jars/picard.jar"
+

So then when you want to run a Picard tool, you just need to call the jar by its shortcut, like this:

+
java -jar $PICARD <Toolname> [options]
+

The exact way to set this up depends on what shell you're using and how your environment is configured. We like this overview and tutorial which explains how it all works; but if you are new to the command line environment and you find this too much to deal with, we recommend asking for help from your institution's IT support group.

+

This completes the installation process.

+ +

Open a shell and run:

+
java -jar picard.jar -h 
+

This should print out some version and usage information for the Picard tools. Note an important difference between Picard and command-line tools such as BWA or SAMtools: to use those, you call the program directly and specify which of its internal commands you want to apply, whereas to use Picard you call on Java itself as the main program, then specify the picard.jar file followed by the name of the tool you want to run. This applies to all Picard tools; to use them you will always build your command lines like this:

+
java -jar picard.jar <ToolName> [options] 
+

This means you first make the call to Java itself as the main program, then specify the picard.jar file, then specify which tool you want, and finally you pass whatever other arguments (input files, parameters etc.) are needed for the analysis.

+

Note that the command-line syntax of Picard tools has recently changed from java -jar <ToolName>.jar to java -jar picard.jar <ToolName>. We are using the newer syntax in this document, but some of our other documents may not have been updated yet. If you encounter any documents using the old syntax, let us know and we'll update them accordingly. If you are already using an older version of Picard, either adapt the commands or better, upgrade your version!

+

Next we will see that GATK tools are called in essentially the same way, although the way the options are specified is a little different. The reasons for how tools in a given software package are organized and invoked are largely due to the preferences of the software developers. They generally do not reflect strict technical requirements, although they can have an effect on speed and efficiency.

+
+

2. Genome Analysis Toolkit (GATK)

+

Hopefully if you're reading this, you're already acquainted with the purpose of the GATK, so go ahead and download the latest version of the software package.

+

In order to access the downloads, you need to register for a free account on the GATK support forum. You will also need to read and accept the license agreement before downloading the GATK software package. Note that if you intend to use the GATK for commercial purposes, you will need to purchase a license. See the licensing page for an overview of the commercial licensing conditions.

+ +

Unpack the tar file using:

+
tar xjf GenomeAnalysisTK-3.6-0.tar.bz2 
+

This will produce a directory called GenomeAnalysisTK-3.6-0 containing the GATK jar file, which is called GenomeAnalysisTK.jar, as well as a directory of example files called resources. GATK tools are distributed as a single pre-compiled Java executable so there is no need to compile them. Just like we discussed for Picard, it's not possible to add the GATK to your path, but you can set up a shortcut to the jar file using environment variables as described above.

+

This completes the installation process.

+ +

Open a shell and run:

+
java -jar GenomeAnalysisTK.jar -h 
+

This should print out some version and usage information, as well as a list of the tools included in the GATK. As the Usage line states, to use GATK you will always build your command lines like this:

+
java -jar GenomeAnalysisTK.jar -T <ToolName> [arguments] 
+

This means that just like for Picard, you first make the call to Java itself as the main program, then specify the GenomeAnalysisTK.jar file, then specify which tool you want, and finally you pass whatever other arguments (input files, parameters etc.) are needed for the analysis.

+
+

3. IGV

+

The Integrative Genomics Viewer is a genome browser that allows you to view BAM, VCF and other genomic file information in context. It has a graphical user interface that is very easy to use, and can be downloaded for free (though registration is required) from this website. We encourage you to read through IGV's very helpful user guide, which includes many detailed tutorials that will help you use the program most effectively.

+
+

4. RStudio IDE and R libraries ggplot2 and gsalib

+

Download the latest version of RStudio IDE. The webpage should automatically detect what platform you are running on and recommend the version most suitable for your system.

+ +

Follow the installation instructions provided. Binaries are provided for all major platforms; typically they just need to be placed in your Applications (or Programs) directory. Open RStudio and type the following command in the console window:

+
install.packages("ggplot2") 
+

This will download and install the ggplot2 library as well as any other library packages that ggplot2 depends on for its operation. Note that some users have reported having to install two additional packages themselves, called reshape and gplots, which you can do as follows:

+
install.packages("reshape")
+install.packages("gplots")
+

Finally, do the same thing to install the gsalib library:

+
install.packages("gsalib")
+

This will download and install the gsalib library.

+
+

5. SAMtools

+

Read the overview of the SAMtools software on the SAMtools project homepage, then download the latest version of the software package.

+ +

Unpack the tar file using:

+
tar xvjf samtools-0.1.2.tar.bz2 
+

This will produce a directory called samtools-0.1.2 containing the files necessary to compile the SAMtools binary. Move to this directory and compile using:

+
cd samtools-0.1.2 
+make 
+

The compiled binary is called samtools. You should find it within the same folder (samtools-0.1.2 in this example). Finally, add the SAMtools binary to your path to make it available on the command line. This completes the installation process.

+ +

Open a shell and run:

+
samtools 
+

This should print out some version information as well as a list of commands. As the Usage line states, to use SAMtools you will always build your command lines like this:

+
samtools <command> [options] 
+

This means you first make the call to the binary (samtools), then you specify which command (method) you wish to use (e.g. index) then any options (i.e. arguments such as input files or parameters) used by the program to perform that command. This is a similar convention as used by BWA.

+
+

6. RTG Tools

+

RTG Tools is a free open-source software package produced by a commercial company called Real Time Genomics. This toolkit includes some variant evaluation and plotting tools that we find useful for teaching because they're fairly user-friendly and produce neat interactive plots.

+

You can download the toolkit from the RTG website, which provides packages for Linux, MacOS X and Windows.

+ +

After unzipping the file, follow the instructions in the README file that's included in the download package. On a Mac, moving the package to your preferred location and adding the rtg binary to your path to make it available on the command line is sufficient to complete the installation process.

+ +

Open a shell and run:

+
rtg
+

This should print out some usage information as well as a list of commands. As stated, to use the RTG tools you will always build your command lines like this:

+
rtg <command> [options] 
+

This means you first make the call to the binary (rtg), then you specify which command (method) you wish to use (e.g. vcfeval) then any options (i.e. arguments such as input files or parameters) used by the program to perform that command. This is a similar convention as used by BWA.

+

We will use the RTG Tools modules vcfeval and rocplot. You'll find a PDF file named RTGOperationsManual.pdf containing detailed documentation included in the download package. For our workshops, the relevant pages are 38–42 (for vcfeval) and 44–46 (for rocplot).
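A minimal sketch of that workflow (file names are placeholders; vcfeval compares a callset against a baseline VCF using an SDF-formatted reference created with rtg format):
rtg format -o reference_sdf reference.fasta
+rtg vcfeval -b truth_calls.vcf.gz -c my_calls.vcf.gz -t reference_sdf -o vcfeval_output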

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Perform_local_realignment_around_indels.md b/doc_archive/tutorials/(howto)_Perform_local_realignment_around_indels.md new file mode 100644 index 000000000..a602b9511 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Perform_local_realignment_around_indels.md @@ -0,0 +1,155 @@ +## (howto) Perform local realignment around indels + +http://gatkforums.broadinstitute.org/gatk/discussion/7156/howto-perform-local-realignment-around-indels + +

+This tutorial replaces Tutorial#2800 and applies to data types within the scope of the GATK Best Practices variant discovery workflow.

+

We provide example data and example commands for performing local realignment around small insertions and deletions (indels) against a reference. The resulting BAM reduces false positive SNPs and represents indels parsimoniously. First we use RealignerTargetCreator to identify and create a target intervals list (step 1). Then we perform local realignment for the target intervals using IndelRealigner (step 2).

+
+

Jump to a section

+
    +
  1. Introduction
  2. +
  3. Create target intervals list using RealignerTargetCreator
  4. +
  5. Realign reads using IndelRealigner
  6. +
  7. Some additional considerations
  8. +
  9. Related resources
  10. +
+
+

+

1. Introduction and tutorial materials

+

Why do indel realignment?

+

Local realignment around indels allows us to correct mapping errors made by genome aligners and make read alignments more consistent in regions that contain indels.

+

Genome aligners can only consider each read independently, and the scoring strategies they use to align reads relative to the reference limit their ability to align reads well in the presence of indels. Depending on the variant event and its relative location within a read, the aligner may favor alignments with mismatches or soft-clips instead of opening a gap in either the read or the reference sequence. In addition, the aligner's scoring scheme may use arbitrary tie-breaking, leading to different, non-parsimonious representations of the event in different reads.

+

In contrast, local realignment considers all reads spanning a given position. This makes it possible to achieve a high-scoring consensus that supports the presence of an indel event. It also produces a more parsimonious representation of the data in the region.

+

This two-step indel realignment process first identifies such regions where alignments may potentially be improved, then realigns the reads in these regions using a consensus model that takes all reads in the alignment context together.

+

Tools involved

+ +

Prerequisites

+ +

Download example data

+ +

back to top

+
+

+

2. Create target intervals list using RealignerTargetCreator

+

For simplicity, we use a single known indels VCF, included in the tutorial data. For recommended resources, see Article#1247.

+

In the command, RealignerTargetCreator takes a coordinate-sorted and indexed BAM and a VCF of known indels and creates a target intervals file.

+
java -jar GenomeAnalysisTK.jar \
+    -T RealignerTargetCreator \
+    -R human_g1k_v37_decoy.fasta \
+    -L 10:96000000-97000000 \
+    -known INDEL_chr10_1Mb_b37_1000G_phase3_v4_20130502.vcf \
+    -I 7156_snippet.bam \
+    -o 7156_realignertargetcreator.intervals
+

In the resulting file, 7156_realignertargetcreator.intervals, intervals represent sites of extant and potential indels. If sites are proximal, the tool represents them as a larger interval spanning the sites.

+

Comments on specific parameters

+ +

+

The target intervals file

+

The first ten rows of 7156_realignertargetcreator.intervals are as follows. The file is a text-based one-column list with one interval per row in 1-based coordinates. Header and column label are absent. For an interval derived from a known indel, the start position refers to the corresponding known variant. For example, for the first interval, we can zgrep -w 96000399 INDEL_chr10_1Mb_b37_1000G_phase3_v4_20130502.vcf for details on the 22bp deletion annotated at position 96000399.

+
10:96000399-96000421
+10:96002035-96002036
+10:96002573-96002577
+10:96003556-96003558
+10:96004176-96004177
+10:96005264-96005304
+10:96006455-96006461
+10:96006871-96006872
+10:96007627-96007628
+10:96008204
+

To view intervals on IGV, convert the list to 0-based BED format using the following AWK command. The command saves a new text-based file with .bed extension where chromosome, start and end are tab-separated, and the start position is one less than that in the intervals list.

+
awk -F '[:-]' 'BEGIN { OFS = "\t" } { if( $3 == "") { print $1, $2-1, $2 } else { print $1, $2-1, $3}}' 7156_realignertargetcreator.intervals > 7156_realignertargetcreator.bed
+

back to top

+
+

+

3. Realign reads using IndelRealigner

+

In the following command, IndelRealigner takes a coordinate-sorted and indexed BAM and a target intervals file generated by RealignerTargetCreator. IndelRealigner then performs local realignment on reads coincident with the target intervals, using consensus sequences derived from indels present in the original alignment.

+
java -Xmx8G -Djava.io.tmpdir=/tmp -jar GenomeAnalysisTK.jar \
+    -T IndelRealigner \
+    -R human_g1k_v37_decoy.fasta \
+    -targetIntervals 7156_realignertargetcreator.intervals \
+    -known INDEL_chr10_1Mb_b37_1000G_phase3_v4_20130502.vcf \ 
+    -I 7156_snippet.bam \
+    -o 7156_snippet_indelrealigner.bam
+

The resulting coordinate-sorted and indexed BAM contains the same records as the original BAM but with changes to realigned records and their mates. Our tutorial's two IGV screenshots show realigned reads in two different loci. For simplicity, the screenshots show the subset of reads that realigned. For screenshots of full alignments for the same loci, see here and here.

+

Comments on specific parameters

+ +

Changes to alignment records

+

For our example data, 194 alignment records realign for ~89 sites. These records now have the OC tag to mark the original CIGAR string. We can use the OC tag to pull out realigned reads; instructions for this are in section 4. The following screenshot shows an example pair of records before and after indel realignment. We note seven changes with asterisks, blue for before and red for after, for both the realigned read and for its mate.
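For instance, one quick way to preview the realigned records is to filter on that tag (a sketch; the OC tag appears in the optional fields of each realigned SAM record):
samtools view 7156_snippet_indelrealigner.bam | grep 'OC:Z:' | less -S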

+ +

Changes to the example realigned record:

+ +

Changes to the realigned read's mate record:

+ +

back to top

+
+

+

4. Some additional considerations

+

RealignerTargetCreator provides a -maxInterval cutoff that drops intervals from the list if they are too large. This is because the compute required to realign a region increases quadratically with the number of reads per interval, and larger intervals tend to include more reads. By the same reasoning, increasing read depth, e.g. with additional alignment files, increases the required compute.

+

Our tutorial's INDEL_chr10_1Mb_b37_1000G_phase3_v4_20130502.vcf contains 1168 indel-only records. The following are metrics on intervals created using the three available options.

+
               #intervals    avg length     basepair coverage     
+VCF only       1161           3.33           3,864         
+BAM only        487          15.22           7,412          
+VCF+BAM        1151          23.07          26,558         
+

You can project the genomic coverage of the intervals as a function of the interval density (number of intervals per basepair) derived from varying the known indel density (number of indel records in the VCF). This in turn allows you to anticipate compute for indel realignment. The density of indel sites increases the interval length following a power law (y=ax^b). The constant (a) and the power (b) are different for intervals created with VCF only and with VCF+BAM. For our example data, these average interval lengths are well within the length of a read and minimally vary the reads per interval and thus the memory needed for indel realignment.

+

back to top

+
+

+

5. Related resources

+ +

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Recalibrate_base_quality_scores_=_run_BQSR.md b/doc_archive/tutorials/(howto)_Recalibrate_base_quality_scores_=_run_BQSR.md new file mode 100644 index 000000000..c1e3421a6 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Recalibrate_base_quality_scores_=_run_BQSR.md @@ -0,0 +1,75 @@ +## (howto) Recalibrate base quality scores = run BQSR + +http://gatkforums.broadinstitute.org/gatk/discussion/2801/howto-recalibrate-base-quality-scores-run-bqsr + +

Objective

+

Recalibrate base quality scores in order to correct sequencing errors and other experimental artifacts.

+

Prerequisites

+ +

Steps

+
    +
  1. Analyze patterns of covariation in the sequence dataset
  2. +
  3. Do a second pass to analyze covariation remaining after recalibration
  4. +
  5. Generate before/after plots
  6. +
  7. Apply the recalibration to your sequence data
  8. +
+
+

1. Analyze patterns of covariation in the sequence dataset

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T BaseRecalibrator \ 
+    -R reference.fa \ 
+    -I input_reads.bam \ 
+    -L 20 \ 
+    -knownSites dbsnp.vcf \ 
+    -knownSites gold_indels.vcf \ 
+    -o recal_data.table 
+

Expected Result

+

This creates a GATKReport file called recal_data.table containing several tables. These tables contain the covariation data that will be used in a later step to recalibrate the base qualities of your sequence data.

+

It is imperative that you provide the program with a set of known sites, otherwise it will refuse to run. The known sites are used to build the covariation model and estimate empirical base qualities. For details on what to do if there are no known sites available for your organism of study, please see the online GATK documentation.

+

Note that -L 20 is used here and in the next steps to restrict analysis to only chromosome 20 in the b37 human genome reference build. To run against a different reference, you may need to change the name of the contig according to the nomenclature used in your reference.

+
+

2. Do a second pass to analyze covariation remaining after recalibration

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T BaseRecalibrator \ 
+    -R reference.fa \ 
+    -I input_reads.bam \ 
+    -L 20 \ 
+    -knownSites dbsnp.vcf \ 
+    -knownSites gold_indels.vcf \ 
+    -BQSR recal_data.table \ 
+    -o post_recal_data.table 
+

Expected Result

+

This creates another GATKReport file, which we will use in the next step to generate plots. Note the use of the -BQSR flag, which tells the GATK engine to perform on-the-fly recalibration based on the first recalibration data table.

+
+

3. Generate before/after plots

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T AnalyzeCovariates \ 
+    -R reference.fa \ 
+    -L 20 \ 
+    -before recal_data.table \
+    -after post_recal_data.table \
+    -plots recalibration_plots.pdf
+

Expected Result

+

This generates a document called recalibration_plots.pdf containing plots that show how the reported base qualities match up to the empirical qualities calculated by the BaseRecalibrator. Comparing the before and after plots allows you to check the effect of the base recalibration process before you actually apply the recalibration to your sequence data. For details on how to interpret the base recalibration plots, please see the online GATK documentation.

+
+

4. Apply the recalibration to your sequence data

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T PrintReads \ 
+    -R reference.fa \ 
+    -I input_reads.bam \ 
+    -L 20 \ 
+    -BQSR recal_data.table \ 
+    -o recal_reads.bam 
+

Expected Result

+

This creates a file called recal_reads.bam containing all the original reads, but now with exquisitely accurate base substitution, insertion and deletion quality scores. By default, the original quality scores are discarded in order to keep the file size down. However, you have the option to retain them by adding the flag --emit_original_quals to the PrintReads command, in which case the original qualities will also be written in the file, tagged OQ.

+
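For example, the step 4 command with original qualities retained would look like this (a sketch based on the command above):
java -jar GenomeAnalysisTK.jar -T PrintReads -R reference.fa -I input_reads.bam -L 20 -BQSR recal_data.table --emit_original_quals -o recal_reads.bam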

Notice how this step uses a very simple tool, PrintReads, to apply the recalibration. What’s happening here is that we are loading in the original sequence data, having the GATK engine recalibrate the base qualities on-the-fly thanks to the -BQSR flag (as explained earlier), and just using PrintReads to write out the resulting data to the new file.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Recalibrate_variant_quality_scores_=_run_VQSR.md b/doc_archive/tutorials/(howto)_Recalibrate_variant_quality_scores_=_run_VQSR.md new file mode 100644 index 000000000..c380c0701 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Recalibrate_variant_quality_scores_=_run_VQSR.md @@ -0,0 +1,252 @@ +## (howto) Recalibrate variant quality scores = run VQSR + +http://gatkforums.broadinstitute.org/gatk/discussion/2805/howto-recalibrate-variant-quality-scores-run-vqsr + +

Objective

+

Recalibrate variant quality scores and produce a callset filtered for the desired levels of sensitivity and specificity.

+

Prerequisites

+ +

Caveats

+

This document provides a typical usage example including parameter values. However, the values given may not be representative of the latest Best Practices recommendations. When in doubt, please consult the FAQ document on VQSR training sets and parameters, which overrides this document. See that document also for caveats regarding exome vs. whole genomes analysis design.

+

Steps

+
    +
  1. +

    Prepare recalibration parameters for SNPs
    +a. Specify which call sets the program should use as resources to build the recalibration model
    +b. Specify which annotations the program should use to evaluate the likelihood of SNPs being real
    +c. Specify the desired truth sensitivity threshold values that the program should use to generate tranches
    +d. Determine additional model parameters

    +
  2. +
  3. +

    Build the SNP recalibration model

    +
  4. +
  5. +

    Apply the desired level of recalibration to the SNPs in the call set

    +
  6. +
  7. +

    Prepare recalibration parameters for Indels
    +a. Specify which call sets the program should use as resources to build the recalibration model
    +b. Specify which annotations the program should use to evaluate the likelihood of Indels being real
    +c. Specify the desired truth sensitivity threshold values that the program should use to generate tranches
    +d. Determine additional model parameters

    +
  8. +
  9. +

    Build the Indel recalibration model

    +
  10. +
  11. Apply the desired level of recalibration to the Indels in the call set
  12. +
+
+

1. Prepare recalibration parameters for SNPs

+

a. Specify which call sets the program should use as resources to build the recalibration model

+

For each training set, we use key-value tags to qualify whether the set contains known sites, training sites, and/or truth sites. We also use a tag to specify the prior likelihood that those sites are true (using the Phred scale). On the Phred scale, a prior of Q corresponds to a probability of 1 - 10^(-Q/10) that a site is a true variant, so for example Q15 ≈ 96.84%, Q12 ≈ 93.69% and Q2 ≈ 36.90%.

+ +

This resource is a SNP call set that has been validated to a very high degree of confidence. The program will consider that the variants in this resource are representative of true sites (truth=true), and will use them to train the recalibration model (training=true). We will also use these sites later on to choose a threshold for filtering variants based on sensitivity to truth sites. The prior likelihood we assign to these variants is Q15 (96.84%).

+ +

This resource is a set of polymorphic SNP sites produced by the Omni genotyping array. The program will consider that the variants in this resource are representative of true sites (truth=true), and will use them to train the recalibration model (training=true). The prior likelihood we assign to these variants is Q12 (93.69%).

+ +

This resource is a set of high-confidence SNP sites produced by the 1000 Genomes Project. The program will consider that this resource may contain true variants as well as false positives (truth=false), and will use the variants in it to train the recalibration model (training=true). The prior likelihood we assign to these variants is Q10 (90%).

+ +

This resource is a SNP call set that has not been validated to a high degree of confidence (truth=false). The program will not use the variants in this resource to train the recalibration model (training=false). However, the program will use these to stratify output metrics such as Ti/Tv ratio by whether variants are present in dbsnp or not (known=true). The prior likelihood we assign to these variants is Q2 (36.90%).

+

The default prior likelihood assigned to all other variants is Q2 (36.90%). This low value reflects the fact that the philosophy of the GATK callers is to produce a large, highly sensitive callset that needs to be heavily refined through additional filtering.

+

b. Specify which annotations the program should use to evaluate the likelihood of SNPs being real

+

These annotations are included in the information generated for each variant call by the caller. If an annotation is missing (typically because it was omitted from the calling command) it can be added using the VariantAnnotator tool.

+ +

Total (unfiltered) depth of coverage. Note that this statistic should not be used with exome datasets; see caveat detailed in the VQSR arguments FAQ doc.

+ +

Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples.

+ +

Measure of strand bias (the variation being seen on only the forward or only the reverse strand). More bias is indicative of false positive calls. This complements the StrandOddsRatio (SOR) annotation.

+ +

Measure of strand bias (the variation being seen on only the forward or only the reverse strand). More bias is indicative of false positive calls. This complements the FisherStrand (FS) annotation.

+ +

The rank sum test for mapping qualities. Note that the mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

+ +

The rank sum test for the distance from the end of the reads. If the alternate allele is only seen near the ends of reads, this is indicative of error. Note that the read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

+ +

Estimation of the overall mapping quality of reads supporting a variant call.

+ +

Evidence of inbreeding in a population. See caveats regarding population size and composition detailed in the VQSR arguments FAQ doc.

+

c. Specify the desired truth sensitivity threshold values that the program should use to generate tranches

+ +

Tranches are essentially slices of variants, ranked by VQSLOD, bounded by the threshold values specified in this step. The threshold values themselves refer to the sensitivity we can obtain when we apply them to the call sets that the program uses to train the model. The idea is that the lowest tranche is highly specific but less sensitive (there are very few false positives but potentially many false negatives, i.e. missing calls), and each subsequent tranche in turn introduces additional true positive calls along with a growing number of false positive calls. This allows us to filter variants based on how sensitive we want the call set to be, rather than applying hard filters and then only evaluating how sensitive the call set is using post hoc methods.

+
+

2. Build the SNP recalibration model

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T VariantRecalibrator \ 
+    -R reference.fa \ 
+    -input raw_variants.vcf \ 
+    -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap.vcf \ 
+    -resource:omni,known=false,training=true,truth=true,prior=12.0 omni.vcf \ 
+    -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G.vcf \ 
+    -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.vcf \ 
+    -an DP \ 
+    -an QD \ 
+    -an FS \ 
+    -an SOR \ 
+    -an MQ \
+    -an MQRankSum \ 
+    -an ReadPosRankSum \ 
+    -an InbreedingCoeff \
+    -mode SNP \ 
+    -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 \ 
+    -recalFile recalibrate_SNP.recal \ 
+    -tranchesFile recalibrate_SNP.tranches \ 
+    -rscriptFile recalibrate_SNP_plots.R 
+

Expected Result

+

This creates several files. The most important file is the recalibration report, called recalibrate_SNP.recal, which contains the recalibration data. This is what the program will use in the next step to generate a VCF file in which the variants are annotated with their recalibrated quality scores. There is also a file called recalibrate_SNP.tranches, which contains the quality score thresholds corresponding to the tranches specified in the original command. Finally, if your installation of R and the other required libraries was done correctly, you will also find some PDF files containing plots. These plots illustrate the distribution of variants according to certain dimensions of the model.

+

For detailed instructions on how to interpret these plots, please refer to the VQSR method documentation and presentation videos.

+
+

3. Apply the desired level of recalibration to the SNPs in the call set

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T ApplyRecalibration \ 
+    -R reference.fa \ 
+    -input raw_variants.vcf \ 
+    -mode SNP \ 
+    --ts_filter_level 99.0 \ 
+    -recalFile recalibrate_SNP.recal \ 
+    -tranchesFile recalibrate_SNP.tranches \ 
+    -o recalibrated_snps_raw_indels.vcf 
+

Expected Result

+

This creates a new VCF file, called recalibrated_snps_raw_indels.vcf, which contains all the original variants from the original raw_variants.vcf file, but now the SNPs are annotated with their recalibrated quality scores (VQSLOD) and either PASS or FILTER depending on whether or not they are included in the selected tranche.

+

Here we are taking the second lowest of the tranches specified in the original recalibration command. This means that we are applying to our data set the level of sensitivity that would allow us to retrieve 99% of true variants from the truth training sets of HapMap and Omni SNPs. If we wanted to be more specific (and therefore have less risk of including false positives, at the risk of missing real sites) we could take the very lowest tranche, which would only retrieve 90% of the truth training sites. If we wanted to be more sensitive (and therefore less specific, at the risk of including more false positives) we could take the higher tranches. In our Best Practices documentation, we recommend taking the second highest tranche (99.9%) which provides the highest sensitivity you can get while still being acceptably specific.

+
+

4. Prepare recalibration parameters for Indels

+

a. Specify which call sets the program should use as resources to build the recalibration model

+

For each training set, we use key-value tags to qualify whether the set contains known sites, training sites, and/or truth sites. We also use a tag to specify the prior likelihood that those sites are true (using the Phred scale).

+ +

This resource (the Mills and 1000G gold standard Indel set) is an Indel call set that has been validated to a high degree of confidence. The program will consider that the variants in this resource are representative of true sites (truth=true), and will use them to train the recalibration model (training=true). The prior likelihood we assign to these variants is Q12 (93.69%).

+

The default prior likelihood assigned to all other variants is Q2 (36.90%). This low value reflects the fact that the philosophy of the GATK callers is to produce a large, highly sensitive callset that needs to be heavily refined through additional filtering.
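For reference, these percentages follow directly from the Phred scale used for the priors: a prior of Q corresponds to a probability of being a true site of 1 - 10^(-Q/10). Worked out for the values above:

prior Q12: 1 - 10^(-12/10) = 1 - 0.0631 = 0.9369, i.e. 93.69%
prior Q2:  1 - 10^(-2/10)  = 1 - 0.6310 = 0.3690, i.e. 36.90%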

+

b. Specify which annotations the program should use to evaluate the likelihood of Indels being real

+

These annotations are included in the information generated for each variant call by the caller. If an annotation is missing (typically because it was omitted from the calling command) it can be added using the VariantAnnotator tool.
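As a rough sketch of what that looks like (the file names here are placeholders, and you should check the exact annotation module names against the VariantAnnotator tool documentation), a command along these lines adds annotations to an existing callset using the original reads:

java -jar GenomeAnalysisTK.jar \ 
    -T VariantAnnotator \ 
    -R reference.fa \ 
    -I recal_reads.bam \ 
    -V raw_variants.vcf \ 
    -A Coverage \ 
    -A QualByDepth \ 
    -A FisherStrand \ 
    -A StrandOddsRatio \ 
    -o raw_variants.annotated.vcf 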

+ +

DP: Total (unfiltered) depth of coverage. Note that this statistic should not be used with exome datasets; see caveat detailed in the VQSR arguments FAQ doc.

+ +

QD: Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples.

+ +

FS (FisherStrand): Measure of strand bias (the variation being seen on only the forward or only the reverse strand). More bias is indicative of false positive calls. This complements the StrandOddsRatio (SOR) annotation.

+ +

SOR (StrandOddsRatio): Measure of strand bias (the variation being seen on only the forward or only the reverse strand). More bias is indicative of false positive calls. This complements the FisherStrand (FS) annotation.

+ +

MQRankSum: The rank sum test for mapping qualities. Note that the mapping quality rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

+ +

ReadPosRankSum: The rank sum test for the distance from the end of the reads. If the alternate allele is only seen near the ends of reads, this is indicative of error. Note that the read position rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

+ +

InbreedingCoeff: Evidence of inbreeding in a population. See caveats regarding population size and composition detailed in the VQSR arguments FAQ doc.

+

c. Specify the desired truth sensitivity threshold values that the program should use to generate tranches

+ +

Tranches are essentially slices of variants, ranked by VQSLOD, bounded by the threshold values specified in this step. The threshold values themselves refer to the sensitivity we can obtain when we apply them to the call sets that the program uses to train the model. The idea is that the lowest tranche is highly specific but less sensitive (there are very few false positives but potentially many false negatives, i.e. missing calls), and each subsequent tranche in turn introduces additional true positive calls along with a growing number of false positive calls. This allows us to filter variants based on how sensitive we want the call set to be, rather than applying hard filters and then only evaluating how sensitive the call set is using post hoc methods.

+

d. Determine additional model parameters

+ +

This is the maximum number of Gaussians (i.e. clusters of variants that have similar properties) that the program should try to identify when it runs the variational Bayes algorithm that underlies the machine learning method. In essence, this limits the number of different "profiles" of variants that the program will try to identify. This number should only be increased for datasets that include a very large number of variants.

+
+

5. Build the Indel recalibration model

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T VariantRecalibrator \ 
+    -R reference.fa \ 
+    -input recalibrated_snps_raw_indels.vcf \ 
+    -resource:mills,known=false,training=true,truth=true,prior=12.0 Mills_and_1000G_gold_standard.indels.b37.sites.vcf \
+    -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.b37.vcf \
+    -an QD \
+    -an DP \ 
+    -an FS \ 
+    -an SOR \ 
+    -an MQRankSum \ 
+    -an ReadPosRankSum \ 
+    -an InbreedingCoeff \
+    -mode INDEL \ 
+    -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 \ 
+    --maxGaussians 4 \ 
+    -recalFile recalibrate_INDEL.recal \ 
+    -tranchesFile recalibrate_INDEL.tranches \ 
+    -rscriptFile recalibrate_INDEL_plots.R 
+

Expected Result

+

This creates several files. The most important file is the recalibration report, called recalibrate_INDEL.recal, which contains the recalibration data. This is what the program will use in the next step to generate a VCF file in which the variants are annotated with their recalibrated quality scores. There is also a file called recalibrate_INDEL.tranches, which contains the quality score thresholds corresponding to the tranches specified in the original command. Finally, if your installation of R and the other required libraries was done correctly, you will also find some PDF files containing plots. These plots illustrate the distribution of variants according to certain dimensions of the model.

+

For detailed instructions on how to interpret these plots, please refer to the online GATK documentation.

+
+

6. Apply the desired level of recalibration to the Indels in the call set

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T ApplyRecalibration \ 
+    -R reference.fa \ 
+    -input recalibrated_snps_raw_indels.vcf \ 
+    -mode INDEL \ 
+    --ts_filter_level 99.0 \ 
+    -recalFile recalibrate_INDEL.recal \ 
+    -tranchesFile recalibrate_INDEL.tranches \ 
+    -o recalibrated_variants.vcf 
+

Expected Result

+

This creates a new VCF file, called recalibrated_variants.vcf, which contains all the original variants from the original recalibrated_snps_raw_indels.vcf file, but now the Indels are also annotated with their recalibrated quality scores (VQSLOD) and either PASS or FILTER depending on whether or not they are included in the selected tranche.

+

Here we are taking the second lowest of the tranches specified in the original recalibration command. This means that we are applying to our data set the level of sensitivity that would allow us to retrieve 99% of true variants from the Mills truth training set of Indels. If we wanted to be more specific (and therefore have less risk of including false positives, at the risk of missing real sites) we could take the very lowest tranche, which would only retrieve 90% of the truth training sites. If we wanted to be more sensitive (and therefore less specific, at the risk of including more false positives) we could take the higher tranches. In our Best Practices documentation, we recommend taking the second highest tranche (99.9%), which provides the highest sensitivity you can get while still being acceptably specific.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Revert_a_BAM_file_to_FastQ_format.md b/doc_archive/tutorials/(howto)_Revert_a_BAM_file_to_FastQ_format.md new file mode 100644 index 000000000..a24d386d9 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Revert_a_BAM_file_to_FastQ_format.md @@ -0,0 +1,47 @@ +## (howto) Revert a BAM file to FastQ format + +http://gatkforums.broadinstitute.org/gatk/discussion/2908/howto-revert-a-bam-file-to-fastq-format + +

NOTE: This tutorial has been replaced by a more recent and much improved version, Tutorial#6484.

+
+

Objective

+

Revert a BAM file back to FastQ. This comes in handy when you receive data that has been processed but not according to GATK Best Practices, and you want to reset and reprocess it properly.

+

Prerequisites

+ +

Steps

+
    +
  1. Shuffle the reads in the bam file
  2. +
  3. Revert the BAM file to FastQ format
  4. +
  5. Compress the FastQ file
  6. +
  7. Note for advanced users
  8. +
+
+

1. Shuffle the reads in the bam file

+

Action

+

Shuffle the reads in the bam file so they are not in a biased order before alignment by running the following HTSlib command:

+
htscmd bamshuf -uOn 128 aln_reads.bam tmp > shuffled_reads.bam 
+

Expected Result

+

This creates a new BAM file containing the original reads, which still retain their mapping information, but now they are no longer sorted.

+

The aligner uses blocks of paired reads to estimate the insert size. If you don't shuffle your original bam, the blocks used for insert size estimation will not be randomly distributed across the genome; rather, they will all come from the same region, biasing the insert size calculation. This is a very important step which is unfortunately often overlooked.

+
+

2. Revert the BAM file to FastQ

+

Action

+

Revert the BAM file to FastQ format by running the following HTSlib command:

+
htscmd bam2fq -a shuffled_reads.bam > interleaved_reads.fq 
+

Expected Result

+

This creates an interleaved FastQ file called interleaved_reads.fq containing the now-unmapped paired reads.

+

Interleaved simply means that for each pair of reads in your paired-end data set, both the forward and the reverse reads are in the same file, as opposed to having them in separate files.

+
+

3. Compress the FastQ file

+

Action

+

Compress the FastQ file to reduce its size using the gzip utility:

+
gzip interleaved_reads.fq
+

Expected Result

+

This creates a gzipped FastQ file called interleaved_reads.fq.gz. This file is ready to be used as input for the Best Practices workflow.

+

BWA handles gzipped fastq files natively, so you don’t need to unzip the file to use it later on.
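For example, recent versions of BWA-MEM can consume the interleaved, gzipped file directly through the -p (smart pairing) flag; the reference file name and thread count below are placeholders:

bwa mem -M -t 4 -p reference.fa interleaved_reads.fq.gz > aligned_reads.sam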

+
+

4. Note for advanced users

+

If you’re feeling adventurous, you can do all of the above with this beautiful one-liner, which will save you a heap of time that the program would otherwise spend performing I/O (loading in and writing out data to/from disk):

+
htscmd bamshuf -uOn 128 aln_reads.bam tmp | htscmd bam2fq -a - | gzip > interleaved_reads.fq.gz 
\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Run_Queue_for_the_first_time.md b/doc_archive/tutorials/(howto)_Run_Queue_for_the_first_time.md new file mode 100644 index 000000000..aeb6e76b8 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Run_Queue_for_the_first_time.md @@ -0,0 +1,90 @@ +## (howto) Run Queue for the first time + +http://gatkforums.broadinstitute.org/gatk/discussion/1288/howto-run-queue-for-the-first-time + +

Objective

+

Run a basic analysis command on example data, parallelized with Queue.

+

Prerequisites

+ +

Steps

+
    +
  1. Set up a dry run of Queue
  2. +
  3. Run the analysis for real
  4. +
  5. Running on a computing farm
  6. +
+
+

1. Set up a dry run of Queue

+

One very cool feature of Queue is that you can test your script by doing a "dry run". That means Queue will prepare the analysis and build the scatter commands, but not actually run them. This makes it easier to check the sanity of your script and command.

+

Here we're going to set up a dry run of a CountReads analysis. You should be familiar with the CountReads walker and the example files from the bundles, as used in the basic "GATK for the first time" tutorial. In addition, we're going to use the example QScript called ExampleCountReads.scala provided in the Queue package download.

+

Action

+

Type the following command:

+
java -Djava.io.tmpdir=tmp -jar Queue.jar -S ExampleCountReads.scala -R exampleFASTA.fasta -I exampleBAM.bam
+

where -S ExampleCountReads.scala specifies which QScript we want to run, -R exampleFASTA.fasta specifies the reference sequence, and -I exampleBAM.bam specifies the file of aligned reads we want to analyze.

+

Expected Result

+

After a few seconds you should see output that looks nearly identical to this:

+
INFO  00:30:45,527 QScriptManager - Compiling 1 QScript 
+INFO  00:30:52,869 QScriptManager - Compilation complete 
+INFO  00:30:53,284 HelpFormatter - ---------------------------------------------------------------------- 
+INFO  00:30:53,284 HelpFormatter - Queue v2.0-36-gf5c1c1a, Compiled 2012/08/08 20:18:21 
+INFO  00:30:53,284 HelpFormatter - Copyright (c) 2012 The Broad Institute 
+INFO  00:30:53,284 HelpFormatter - For support and documentation go to http://www.broadinstitute.org/gatk 
+INFO  00:30:53,285 HelpFormatter - Program Args: -S ExampleCountReads.scala -R exampleFASTA.fasta -I exampleBAM.bam 
+INFO  00:30:53,285 HelpFormatter - Date/Time: 2012/08/09 00:30:53 
+INFO  00:30:53,285 HelpFormatter - ---------------------------------------------------------------------- 
+INFO  00:30:53,285 HelpFormatter - ---------------------------------------------------------------------- 
+INFO  00:30:53,290 QCommandLine - Scripting ExampleCountReads 
+INFO  00:30:53,364 QCommandLine - Added 1 functions 
+INFO  00:30:53,364 QGraph - Generating graph. 
+INFO  00:30:53,388 QGraph - ------- 
+INFO  00:30:53,402 QGraph - Pending:  'java'  '-Xmx1024m'  '-Djava.io.tmpdir=/Users/vdauwera/sandbox/Q2/resources/tmp'  '-cp' '/Users/vdauwera/sandbox/Q2/Queue.jar'  'org.broadinstitute.sting.gatk.CommandLineGATK'  '-T' 'CountReads'  '-I' '/Users/vdauwera/sandbox/Q2/resources/exampleBAM.bam'  '-R' '/Users/vdauwera/sandbox/Q2/resources/exampleFASTA.fasta'  
+INFO  00:30:53,403 QGraph - Log:     /Users/vdauwera/sandbox/Q2/resources/ExampleCountReads-1.out 
+INFO  00:30:53,403 QGraph - Dry run completed successfully! 
+INFO  00:30:53,404 QGraph - Re-run with "-run" to execute the functions. 
+INFO  00:30:53,409 QCommandLine - Script completed successfully with 1 total jobs 
+INFO  00:30:53,410 QCommandLine - Writing JobLogging GATKReport to file /Users/vdauwera/sandbox/Q2/resources/ExampleCountReads.jobreport.txt 
+

If you don't see this, check your spelling (GATK commands are case-sensitive), check that the files are in your working directory, and if necessary, re-check that the GATK and Queue are properly installed.

+

If you do see this output, congratulations! You just successfully ran your first Queue dry run!

+
+

2. Run the analysis for real

+

Once you have verified that the Queue functions have been generated successfully, you can execute the pipeline by appending -run to the command line.

+

Action

+

Instead of this command, which we used earlier:

+
java -Djava.io.tmpdir=tmp -jar Queue.jar -S ExampleCountReads.scala -R exampleFASTA.fasta -I exampleBAM.bam
+

this time you type this:

+
java -Djava.io.tmpdir=tmp -jar Queue.jar -S ExampleCountReads.scala -R exampleFASTA.fasta -I exampleBAM.bam -run
+

See the difference?

+

Result

+

You should see output that looks nearly identical to this:

+
INFO  00:56:33,688 QScriptManager - Compiling 1 QScript 
+INFO  00:56:39,327 QScriptManager - Compilation complete 
+INFO  00:56:39,487 HelpFormatter - ---------------------------------------------------------------------- 
+INFO  00:56:39,487 HelpFormatter - Queue v2.0-36-gf5c1c1a, Compiled 2012/08/08 20:18:21 
+INFO  00:56:39,488 HelpFormatter - Copyright (c) 2012 The Broad Institute 
+INFO  00:56:39,488 HelpFormatter - For support and documentation go to http://www.broadinstitute.org/gatk 
+INFO  00:56:39,489 HelpFormatter - Program Args: -S ExampleCountReads.scala -R exampleFASTA.fasta -I exampleBAM.bam -run 
+INFO  00:56:39,490 HelpFormatter - Date/Time: 2012/08/09 00:56:39 
+INFO  00:56:39,490 HelpFormatter - ---------------------------------------------------------------------- 
+INFO  00:56:39,491 HelpFormatter - ---------------------------------------------------------------------- 
+INFO  00:56:39,498 QCommandLine - Scripting ExampleCountReads 
+INFO  00:56:39,569 QCommandLine - Added 1 functions 
+INFO  00:56:39,569 QGraph - Generating graph. 
+INFO  00:56:39,589 QGraph - Running jobs. 
+INFO  00:56:39,623 FunctionEdge - Starting:  'java'  '-Xmx1024m'  '-Djava.io.tmpdir=/Users/vdauwera/sandbox/Q2/resources/tmp'  '-cp' '/Users/vdauwera/sandbox/Q2/Queue.jar'  'org.broadinstitute.sting.gatk.CommandLineGATK'  '-T' 'CountReads'  '-I' '/Users/vdauwera/sandbox/Q2/resources/exampleBAM.bam'  '-R' '/Users/vdauwera/sandbox/Q2/resources/exampleFASTA.fasta'  
+INFO  00:56:39,623 FunctionEdge - Output written to /Users/GG/codespace/GATK/Q2/resources/ExampleCountReads-1.out 
+INFO  00:56:50,301 QGraph - 0 Pend, 1 Run, 0 Fail, 0 Done 
+INFO  00:57:09,827 FunctionEdge - Done:  'java'  '-Xmx1024m'  '-Djava.io.tmpdir=/Users/vdauwera/sandbox/Q2/resources/tmp'  '-cp' '/Users/vdauwera/sandbox/Q2/resources/Queue.jar'  'org.broadinstitute.sting.gatk.CommandLineGATK'  '-T' 'CountReads'  '-I' '/Users/vdauwera/sandbox/Q2/resources/exampleBAM.bam'  '-R' '/Users/vdauwera/sandbox/Q2/resources/exampleFASTA.fasta'  
+INFO  00:57:09,828 QGraph - 0 Pend, 0 Run, 0 Fail, 1 Done 
+INFO  00:57:09,835 QCommandLine - Script completed successfully with 1 total jobs 
+INFO  00:57:09,835 QCommandLine - Writing JobLogging GATKReport to file /Users/vdauwera/sandbox/Q2/resources/ExampleCountReads.jobreport.txt 
+INFO  00:57:10,107 QCommandLine - Plotting JobLogging GATKReport to file /Users/vdauwera/sandbox/Q2/resources/ExampleCountReads.jobreport.pdf 
+WARN  00:57:18,597 RScriptExecutor - RScript exited with 1. Run with -l DEBUG for more info. 
+

Great! It works!

+

The results of the traversal will be written to a file in the current directory. The name of the file will be printed in the output; in this example it is ExampleCountReads-1.out.

+

If for some reason the run was interrupted, in most cases you can resume simply by re-launching the same command. Queue will pick up where it left off without redoing the parts that ran successfully.
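In practice that just means re-issuing the exact same command with -run. If you instead want to force Queue to redo everything regardless of previously completed outputs, the -startFromScratch flag (listed in the Queue help output) does that; for example:

java -Djava.io.tmpdir=tmp -jar Queue.jar -S ExampleCountReads.scala -R exampleFASTA.fasta -I exampleBAM.bam -run -startFromScratch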

+
+

3. Running on a computing farm

+

Run with -bsub to run on LSF, or for early Grid Engine support see Queue with Grid Engine.
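For example, to dispatch the same CountReads analysis to an LSF farm you would add -bsub (and optionally -jobQueue; the queue name "hour" below is just an illustration, use whatever queue your farm provides):

java -Djava.io.tmpdir=tmp -jar Queue.jar -S ExampleCountReads.scala -R exampleFASTA.fasta -I exampleBAM.bam -run -bsub -jobQueue hour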

+

See also QFunction and Command Line Options for more info on Queue options.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Run_the_GATK_for_the_first_time.md b/doc_archive/tutorials/(howto)_Run_the_GATK_for_the_first_time.md new file mode 100644 index 000000000..e54515fbd --- /dev/null +++ b/doc_archive/tutorials/(howto)_Run_the_GATK_for_the_first_time.md @@ -0,0 +1,165 @@ +## (howto) Run the GATK for the first time + +http://gatkforums.broadinstitute.org/gatk/discussion/1209/howto-run-the-gatk-for-the-first-time + +

NOTICE:

+

This tutorial is slightly out of date so the output is a little different. We'll update this soon, but in the meantime, don't freak out if you get a result that reads something like

+
INFO 18:32:38,826 CountReads - CountReads counted 33 reads in the traversal 
+

instead of

+
INFO  16:17:46,061 Walker - [REDUCE RESULT] Traversal result is: 33 
+

You're doing the right thing and getting the right result.

+

And of course, if in doubt, just post a comment on this article; we're here to answer your questions.

+
+

Objective

+

Run a basic analysis command on example data.

+

Prerequisites

+ +

Steps

+
    +
  1. Invoke the GATK CountReads command
  2. +
  3. Further exercises
  4. +
+
+

1. Invoke the GATK CountReads command

+

A very simple analysis that you can do with the GATK is getting a count of the reads in a BAM file. The GATK is capable of much more powerful analyses, but this is a good starting example because there are very few things that can go wrong.

+

So we are going to count the reads in the file exampleBAM.bam, which you can find in the GATK resource bundle along with its associated index (same file name with .bai extension), as well as the example reference exampleFASTA.fasta and its associated index (same file name with .fai extension) and dictionary (same file name with .dict extension). Copy them to your working directory so that your directory contents look like this:

+
[bm4dd-56b:~/codespace/gatk/sandbox] vdauwera% ls -la
+drwxr-xr-x  9 vdauwera  CHARLES\Domain Users     306 Jul 25 16:29 .
+drwxr-xr-x@ 6 vdauwera  CHARLES\Domain Users     204 Jul 25 15:31 ..
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users    3635 Apr 10 07:39 exampleBAM.bam
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users     232 Apr 10 07:39 exampleBAM.bam.bai
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users     148 Apr 10 07:39 exampleFASTA.dict
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users  101673 Apr 10 07:39 exampleFASTA.fasta
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users      20 Apr 10 07:39 exampleFASTA.fasta.fai
+

Action

+

Type the following command:

+
java -jar <path to GenomeAnalysisTK.jar> -T CountReads -R exampleFASTA.fasta -I exampleBAM.bam 
+

where -T CountReads specifies which analysis tool we want to use, -R exampleFASTA.fasta specifies the reference sequence, and -I exampleBAM.bam specifies the file of aligned reads we want to analyze.

+

For any analysis that you want to run on a set of aligned reads, you will always need to use at least these three arguments:

+ +

They don't have to be in that order in your command, but this way you can remember that you need them if you TRI...

+

Expected Result

+

After a few seconds you should see output that looks like this:

+
INFO  16:17:45,945 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:17:45,946 HelpFormatter - The Genome Analysis Toolkit (GATK) v2.0-22-g40f97eb, Compiled 2012/07/25 15:29:41 
+INFO  16:17:45,947 HelpFormatter - Copyright (c) 2010 The Broad Institute 
+INFO  16:17:45,947 HelpFormatter - For support and documentation go to http://www.broadinstitute.org/gatk 
+INFO  16:17:45,947 HelpFormatter - Program Args: -T CountReads -R exampleFASTA.fasta -I exampleBAM.bam 
+INFO  16:17:45,947 HelpFormatter - Date/Time: 2012/07/25 16:17:45 
+INFO  16:17:45,947 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:17:45,948 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:17:45,950 GenomeAnalysisEngine - Strictness is SILENT 
+INFO  16:17:45,982 SAMDataSource$SAMReaders - Initializing SAMRecords in serial 
+INFO  16:17:45,993 SAMDataSource$SAMReaders - Done initializing BAM readers: total time 0.01 
+INFO  16:17:46,060 TraversalEngine - [INITIALIZATION COMPLETE; TRAVERSAL STARTING] 
+INFO  16:17:46,060 TraversalEngine -        Location processed.reads  runtime per.1M.reads completed total.runtime remaining 
+INFO  16:17:46,061 Walker - [REDUCE RESULT] Traversal result is: 33 
+INFO  16:17:46,061 TraversalEngine - Total runtime 0.00 secs, 0.00 min, 0.00 hours 
+INFO  16:17:46,100 TraversalEngine - 0 reads were filtered out during traversal out of 33 total (0.00%) 
+INFO  16:17:46,729 GATKRunReport - Uploaded run statistics report to AWS S3 
+

Depending on the GATK release, you may see slightly different information output, but you know everything is running correctly if you see the line:

+
INFO  21:53:04,556 Walker - [REDUCE RESULT] Traversal result is: 33 
+

somewhere in your output.

+

If you don't see this, check your spelling (GATK commands are case-sensitive), check that the files are in your working directory, and if necessary, re-check that the GATK is properly installed.

+

If you do see this output, congratulations! You just successfully ran your first GATK analysis!

+

Basically the output you see means that the CountReadsWalker (which you invoked with the command line option -T CountReads) counted 33 reads in the exampleBAM.bam file, which is exactly what we expect to see.

+

Wait, what is this walker thing?

+

In the GATK jargon, we call the tools walkers because the way they work is that they walk through the dataset --either along the reference sequence (LocusWalkers), or down the list of reads in the BAM file (ReadWalkers)-- collecting the requested information along the way.

+
+

2. Further Exercises

+

Now that you're rocking the read counts, you can start to expand your use of the GATK command line.

+

Let's say you don't care about counting reads anymore; now you want to know the number of loci (positions on the genome) that are covered by one or more reads. The name of the tool, or walker, that does this is CountLoci. Since the structure of the GATK command is basically always the same, you can simply switch the tool name, right?

+

Action

+

Instead of this command, which we used earlier:

+
java -jar <path to GenomeAnalysisTK.jar> -T CountReads -R exampleFASTA.fasta -I exampleBAM.bam 
+

this time you type this:

+
java -jar <path to GenomeAnalysisTK.jar> -T CountLoci -R exampleFASTA.fasta -I exampleBAM.bam 
+

See the difference?

+

Result

+

You should see something like this output:

+
INFO  16:18:26,183 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:18:26,185 HelpFormatter - The Genome Analysis Toolkit (GATK) v2.0-22-g40f97eb, Compiled 2012/07/25 15:29:41 
+INFO  16:18:26,185 HelpFormatter - Copyright (c) 2010 The Broad Institute 
+INFO  16:18:26,185 HelpFormatter - For support and documentation go to http://www.broadinstitute.org/gatk 
+INFO  16:18:26,186 HelpFormatter - Program Args: -T CountLoci -R exampleFASTA.fasta -I exampleBAM.bam 
+INFO  16:18:26,186 HelpFormatter - Date/Time: 2012/07/25 16:18:26 
+INFO  16:18:26,186 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:18:26,186 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:18:26,189 GenomeAnalysisEngine - Strictness is SILENT 
+INFO  16:18:26,222 SAMDataSource$SAMReaders - Initializing SAMRecords in serial 
+INFO  16:18:26,233 SAMDataSource$SAMReaders - Done initializing BAM readers: total time 0.01 
+INFO  16:18:26,351 TraversalEngine - [INITIALIZATION COMPLETE; TRAVERSAL STARTING] 
+INFO  16:18:26,351 TraversalEngine -        Location processed.sites  runtime per.1M.sites completed total.runtime remaining 
+2052
+INFO  16:18:26,411 TraversalEngine - Total runtime 0.08 secs, 0.00 min, 0.00 hours 
+INFO  16:18:26,450 TraversalEngine - 0 reads were filtered out during traversal out of 33 total (0.00%) 
+INFO  16:18:27,124 GATKRunReport - Uploaded run statistics report to AWS S3 
+

Great! But wait -- where's the result? Last time the result was given on this line:

+
INFO  21:53:04,556 Walker - [REDUCE RESULT] Traversal result is: 33 
+

But this time there is no line that says [REDUCE RESULT]! Is something wrong?

+

Not really. The program ran just fine -- but we forgot to give it an output file name. You see, the CountLoci walker is set up to output the result of its calculations to a text file, unlike CountReads, which is perfectly happy to output its result to the terminal screen.

+

Action

+

So we repeat the command, but this time we specify an output file, like this:

+
java -jar <path to GenomeAnalysisTK.jar> -T CountLoci -R exampleFASTA.fasta -I exampleBAM.bam -o output.txt
+

where -o (lowercase o, not zero) is used to specify the output.

+

Result

+

You should get essentially the same output on the terminal screen as previously (but notice the difference in the line that contains Program Args -- the new argument is included):

+
INFO  16:29:15,451 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:29:15,453 HelpFormatter - The Genome Analysis Toolkit (GATK) v2.0-22-g40f97eb, Compiled 2012/07/25 15:29:41 
+INFO  16:29:15,453 HelpFormatter - Copyright (c) 2010 The Broad Institute 
+INFO  16:29:15,453 HelpFormatter - For support and documentation go to http://www.broadinstitute.org/gatk 
+INFO  16:29:15,453 HelpFormatter - Program Args: -T CountLoci -R exampleFASTA.fasta -I exampleBAM.bam -o output.txt 
+INFO  16:29:15,454 HelpFormatter - Date/Time: 2012/07/25 16:29:15 
+INFO  16:29:15,454 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:29:15,454 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:29:15,457 GenomeAnalysisEngine - Strictness is SILENT 
+INFO  16:29:15,488 SAMDataSource$SAMReaders - Initializing SAMRecords in serial 
+INFO  16:29:15,499 SAMDataSource$SAMReaders - Done initializing BAM readers: total time 0.01 
+INFO  16:29:15,618 TraversalEngine - [INITIALIZATION COMPLETE; TRAVERSAL STARTING] 
+INFO  16:29:15,618 TraversalEngine -        Location processed.sites  runtime per.1M.sites completed total.runtime remaining 
+INFO  16:29:15,679 TraversalEngine - Total runtime 0.08 secs, 0.00 min, 0.00 hours 
+INFO  16:29:15,718 TraversalEngine - 0 reads were filtered out during traversal out of 33 total (0.00%) 
+INFO  16:29:16,712 GATKRunReport - Uploaded run statistics report to AWS S3 
+

This time however, if we look inside the working directory, there is a newly created file there called output.txt.

+
[bm4dd-56b:~/codespace/gatk/sandbox] vdauwera% ls -la
+drwxr-xr-x  9 vdauwera  CHARLES\Domain Users     306 Jul 25 16:29 .
+drwxr-xr-x@ 6 vdauwera  CHARLES\Domain Users     204 Jul 25 15:31 ..
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users    3635 Apr 10 07:39 exampleBAM.bam
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users     232 Apr 10 07:39 exampleBAM.bam.bai
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users     148 Apr 10 07:39 exampleFASTA.dict
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users  101673 Apr 10 07:39 exampleFASTA.fasta
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users      20 Apr 10 07:39 exampleFASTA.fasta.fai
+-rw-r--r--  1 vdauwera  CHARLES\Domain Users       5 Jul 25 16:29 output.txt
+

This file contains the result of the analysis:

+
[bm4dd-56b:~/codespace/gatk/sandbox] vdauwera% cat output.txt 
+2052
+

This means that there are 2052 loci in the reference sequence that are covered by at least one read in the BAM file.
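If you like, you can cross-check that number outside of the GATK (this assumes samtools is installed and is purely an optional sanity check, not part of the tutorial). Counting the positions covered by at least one read should give the same or a very similar figure, small differences being due to each tool's default read filters:

samtools depth exampleBAM.bam | wc -l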

+

Discussion

+

Okay then, but why not show the full, correct command in the first place? Because this was a good opportunity for you to learn a few of the caveats of the GATK command system, which may save you a lot of frustration later on.

+

Beyond the common basic arguments that almost all GATK walkers require, most of them also have specific requirements or options that are important to how they work. You should always check which arguments are required, recommended and/or optional for the walker you want to use before starting an analysis.

+

Fortunately the GATK is set up to complain (i.e. terminate with an error message) if you try to run it without specifying a required argument. For example, if you try to run this:

+
java -jar <path to GenomeAnalysisTK.jar> -T CountLoci -R exampleFASTA.fasta
+

the GATK will spit out a wall of text, including the basic usage guide that you can invoke with the --help option, and more importantly, the following error message:

+
##### ERROR ------------------------------------------------------------------------------------------
+##### ERROR A USER ERROR has occurred (version 2.0-22-g40f97eb): 
+##### ERROR The invalid arguments or inputs must be corrected before the GATK can proceed
+##### ERROR Please do not post this error to the GATK forum
+##### ERROR
+##### ERROR See the documentation (rerun with -h) for this tool to view allowable command-line arguments.
+##### ERROR Visit our website and forum for extensive documentation and answers to 
+##### ERROR commonly asked questions http://www.broadinstitute.org/gatk
+##### ERROR
+##### ERROR MESSAGE: Walker requires reads but none were provided.
+##### ERROR ------------------------------------------------------------------------------------------
+

You see the line that says ERROR MESSAGE: Walker requires reads but none were provided? This tells you exactly what was wrong with your command.

+

So the GATK will not run if a walker does not have all the required inputs. That's a good thing! But in the case of our first attempt at running CountLoci, the -o argument is not required by the GATK to run -- it's just highly desirable if you actually want the result of the analysis!

+

There will be many other cases of walkers with arguments that are not strictly required, but highly desirable if you want the results to be meaningful.

+

So, at the risk of getting repetitive, always read the documentation of each walker that you want to use!

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Run_the_genotype_refinement_workflow.md b/doc_archive/tutorials/(howto)_Run_the_genotype_refinement_workflow.md new file mode 100644 index 000000000..9cb7f6c59 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Run_the_genotype_refinement_workflow.md @@ -0,0 +1,22 @@ +## (howto) Run the genotype refinement workflow + +http://gatkforums.broadinstitute.org/gatk/discussion/4727/howto-run-the-genotype-refinement-workflow + +

Overview

+

This tutorial describes step-by-step instructions for applying the Genotype Refinement workflow (described in this method article) to your data.

+
+

Step 1: Derive posterior probabilities of genotypes

+

In this first step, we are deriving the posteriors of genotype calls in our callset, recalibratedVariants.vcf, which just came out of the VQSR filtering step; it contains among other samples a trio of individuals (mother, father and child) whose family structure is described in the pedigree file trio.ped (which you need to supply). To do this, we are using the most comprehensive set of high confidence SNPs available to us, a set of sites from Phase 3 of the 1000 Genomes project (available in our resource bundle), which we pass via the --supporting argument.

+
 java -jar GenomeAnalysisTK.jar -R human_g1k_v37_decoy.fasta -T CalculateGenotypePosteriors --supporting 1000G_phase3_v4_20130502.sites.vcf -ped trio.ped -V recalibratedVariants.vcf -o recalibratedVariants.postCGP.vcf
+

This produces the output file recalibratedVariants.postCGP.vcf, in which the posteriors have been annotated wherever possible.

+
+

Step 2: Filter low quality genotypes

+

In this second, very simple step, we are tagging low quality genotypes so we know not to use them in our downstream analyses. We use Q20 as the threshold for quality, which means that any passing genotype has a 99% chance of being correct.
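That 99% figure is just the Phred scale at work: a genotype quality GQ encodes the probability that the genotype call is wrong as 10^(-GQ/10), so

GQ = 20  ->  P(wrong genotype) = 10^(-20/10) = 0.01  ->  99% chance the genotype is correct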

+
java -jar $GATKjar -T VariantFiltration -R $bundlePath/b37/human_g1k_v37_decoy.fasta -V recalibratedVariants.postCGP.vcf -G_filter "GQ < 20.0" -G_filterName lowGQ -o recalibratedVariants.postCGP.Gfiltered.vcf
+

Note that in the resulting VCF, the genotypes that failed the filter are still present, but they are tagged lowGQ with the FT tag of the FORMAT field.
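To make that concrete, here is a purely illustrative (made-up) VCF sample entry showing what a genotype that failed the filter looks like after this step; the FT subfield carries the lowGQ tag while the genotype call itself is retained:

FORMAT                 GT:AD:DP:GQ:FT:PL
Sample (failed GQ<20)  0/1:10,2:12:15:lowGQ:45,0,210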

+
+

Step 3: Annotate possible de novo mutations

+

In this third and final step, we tag variants for which at least one family in the callset shows evidence of a de novo mutation based on the genotypes of the family members.

+
java -jar $GATKjar -T VariantAnnotator -R $bundlePath/b37/human_g1k_v37_decoy.fasta -V recalibratedVariants.postCGP.Gfiltered.vcf -A PossibleDeNovo -ped trio.ped -o recalibratedVariants.postCGP.Gfiltered.deNovos.vcf
+

The annotation output will include a list of the children with possible de novo mutations, classified as either high or low confidence.

+

See section 3 of the method article for a complete description of annotation outputs and section 4 for an example of a call and the interpretation of the annotation values.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Test_your_GATK_installation.md b/doc_archive/tutorials/(howto)_Test_your_GATK_installation.md new file mode 100644 index 000000000..332f0e8aa --- /dev/null +++ b/doc_archive/tutorials/(howto)_Test_your_GATK_installation.md @@ -0,0 +1,71 @@ +## (howto) Test your GATK installation + +http://gatkforums.broadinstitute.org/gatk/discussion/1200/howto-test-your-gatk-installation + +

Objective

+

Test that the GATK is correctly installed, and that the supporting tools like Java are in your path.

+

Prerequisites

+ +

Steps

+
    +
  1. Invoke the GATK usage/help message
  2. +
  3. Troubleshooting
  4. +
+
+

1. Invoke the GATK usage/help message

+

The command we're going to run is a very simple command that asks the GATK to print out a list of available command-line arguments and options. It is so simple that it will ALWAYS work if your GATK package is installed correctly.

+

Note that this command is also helpful when you're trying to remember something like the right spelling or short name for an argument and for whatever reason you don't have access to the web-based documentation.

+

Action

+

Type the following command:

+
java -jar <path to GenomeAnalysisTK.jar> --help
+

replacing the <path to GenomeAnalysisTK.jar> bit with the path you have set up in your command-line environment.

+

Expected Result

+

You should see usage output similar to the following:

+
usage: java -jar GenomeAnalysisTK.jar -T <analysis_type> [-I <input_file>] [-L 
+        <intervals>] [-R <reference_sequence>] [-B <rodBind>] [-D <DBSNP>] [-H 
+        <hapmap>] [-hc <hapmap_chip>] [-o <out>] [-e <err>] [-oe <outerr>] [-A] [-M 
+        <maximum_reads>] [-sort <sort_on_the_fly>] [-compress <bam_compression>] [-fmq0] [-dfrac 
+        <downsample_to_fraction>] [-dcov <downsample_to_coverage>] [-S 
+        <validation_strictness>] [-U] [-P] [-dt] [-tblw] [-nt <numthreads>] [-l 
+        <logging_level>] [-log <log_to_file>] [-quiet] [-debug] [-h]
+-T,--analysis_type <analysis_type>                     Type of analysis to run
+-I,--input_file <input_file>                           SAM or BAM file(s)
+-L,--intervals <intervals>                             A list of genomic intervals over which 
+                                                       to operate. Can be explicitly specified 
+                                                       on the command line or in a file.
+-R,--reference_sequence <reference_sequence>           Reference sequence file
+-B,--rodBind <rodBind>                                 Bindings for reference-ordered data, in 
+                                                       the form <name>,<type>,<file>
+-D,--DBSNP <DBSNP>                                     DBSNP file
+-H,--hapmap <hapmap>                                   Hapmap file
+-hc,--hapmap_chip <hapmap_chip>                        Hapmap chip file
+-o,--out <out>                                         An output file presented to the walker. 
+                                                       Will overwrite contents if file exists.
+-e,--err <err>                                         An error output file presented to the 
+                                                       walker. Will overwrite contents if file 
+                                                       exists.
+-oe,--outerr <outerr>                                  A joint file for 'normal' and error 
+                                                       output presented to the walker. Will 
+                                                       overwrite contents if file exists.
+
+...
+

If you see this message, your GATK installation is ok. You're good to go! If you don't see this message, and instead get an error message, proceed to the next section on troubleshooting.

+
+

2. Troubleshooting

+

Let's try to figure out what's not working.

+

Action

+

First, make sure that your Java version is at least 1.7, by typing the following command:

+
java -version
+

Expected Result

+

You should see something similar to the following text:

+
java version "1.7.0_12"
+Java(TM) SE Runtime Environment (build 1.7.0_12-b04)
+Java HotSpot(TM) 64-Bit Server VM (build 11.2-b01, mixed mode)  
+

Remedial actions

+

If the version is less than 1.7, install the newest version of Java onto the system. If you instead see something like

+
java: Command not found  
+

make sure that java is installed on your machine, and that your PATH variable contains the path to the java executables.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Test_your_Queue_installation.md b/doc_archive/tutorials/(howto)_Test_your_Queue_installation.md new file mode 100644 index 000000000..48ad60cba --- /dev/null +++ b/doc_archive/tutorials/(howto)_Test_your_Queue_installation.md @@ -0,0 +1,100 @@ +## (howto) Test your Queue installation + +http://gatkforums.broadinstitute.org/gatk/discussion/1287/howto-test-your-queue-installation + +

Objective

+

Test that Queue is correctly installed, and that the supporting tools like Java are in your path.

+

Prerequisites

+ +

Steps

+
    +
  1. Invoke the Queue usage/help message
  2. +
  3. Troubleshooting
  4. +
+
+

1. Invoke the Queue usage/help message

+

The command we're going to run is a very simple command that asks Queue to print out a list of available command-line arguments and options. It is so simple that it will ALWAYS work if your Queue package is installed correctly.

+

Note that this command is also helpful when you're trying to remember something like the right spelling or short name for an argument and for whatever reason you don't have access to the web-based documentation.

+

Action

+

Type the following command:

+
java -jar <path to Queue.jar> --help
+

replacing the <path to Queue.jar> bit with the path you have set up in your command-line environment.

+

Expected Result

+

You should see usage output similar to the following:

+
usage: java -jar Queue.jar -S <script> [-jobPrefix <job_name_prefix>] [-jobQueue <job_queue>] [-jobProject <job_project>]
+       [-jobSGDir <job_scatter_gather_directory>] [-memLimit <default_memory_limit>] [-runDir <run_directory>] [-tempDir
+       <temp_directory>] [-emailHost <emailSmtpHost>] [-emailPort <emailSmtpPort>] [-emailTLS] [-emailSSL] [-emailUser
+       <emailUsername>] [-emailPass <emailPassword>] [-emailPassFile <emailPasswordFile>] [-bsub] [-run] [-dot <dot_graph>]
+       [-expandedDot <expanded_dot_graph>] [-startFromScratch] [-status] [-statusFrom <status_email_from>] [-statusTo
+       <status_email_to>] [-keepIntermediates] [-retry <retry_failed>] [-l <logging_level>] [-log <log_to_file>] [-quiet]
+       [-debug] [-h]
+
+ -S,--script <script>                                                      QScript scala file
+ -jobPrefix,--job_name_prefix <job_name_prefix>                            Default name prefix for compute farm jobs.
+ -jobQueue,--job_queue <job_queue>                                         Default queue for compute farm jobs.
+ -jobProject,--job_project <job_project>                                   Default project for compute farm jobs.
+ -jobSGDir,--job_scatter_gather_directory <job_scatter_gather_directory>   Default directory to place scatter gather
+                                                                           output for compute farm jobs.
+ -memLimit,--default_memory_limit <default_memory_limit>                   Default memory limit for jobs, in gigabytes.
+ -runDir,--run_directory <run_directory>                                   Root directory to run functions from.
+ -tempDir,--temp_directory <temp_directory>                                Temp directory to pass to functions.
+ -emailHost,--emailSmtpHost <emailSmtpHost>                                Email SMTP host. Defaults to localhost.
+ -emailPort,--emailSmtpPort <emailSmtpPort>                                Email SMTP port. Defaults to 465 for ssl,
+                                                                           otherwise 25.
+ -emailTLS,--emailUseTLS                                                   Email should use TLS. Defaults to false.
+ -emailSSL,--emailUseSSL                                                   Email should use SSL. Defaults to false.
+ -emailUser,--emailUsername <emailUsername>                                Email SMTP username. Defaults to none.
+ -emailPass,--emailPassword <emailPassword>                                Email SMTP password. Defaults to none. Not
+                                                                           secure! See emailPassFile.
+ -emailPassFile,--emailPasswordFile <emailPasswordFile>                    Email SMTP password file. Defaults to none.
+ -bsub,--bsub_all_jobs                                                     Use bsub to submit jobs
+ -run,--run_scripts                                                        Run QScripts.  Without this flag set only
+                                                                           performs a dry run.
+ -dot,--dot_graph <dot_graph>                                              Outputs the queue graph to a .dot file.  See:
+                                                                           http://en.wikipedia.org/wiki/DOT_language
+ -expandedDot,--expanded_dot_graph <expanded_dot_graph>                    Outputs the queue graph of scatter gather to
+                                                                           a .dot file.  Otherwise overwrites the
+                                                                           dot_graph
+ -startFromScratch,--start_from_scratch                                    Runs all command line functions even if the
+                                                                           outputs were previously output successfully.
+ -status,--status                                                          Get status of jobs for the qscript
+ -statusFrom,--status_email_from <status_email_from>                       Email address to send emails from upon
+                                                                           completion or on error.
+ -statusTo,--status_email_to <status_email_to>                             Email address to send emails to upon
+                                                                           completion or on error.
+ -keepIntermediates,--keep_intermediate_outputs                            After a successful run keep the outputs of
+                                                                           any Function marked as intermediate.
+ -retry,--retry_failed <retry_failed>                                      Retry the specified number of times after a
+                                                                           command fails.  Defaults to no retries.
+ -l,--logging_level <logging_level>                                        Set the minimum level of logging, i.e.
+                                                                           setting INFO get's you INFO up to FATAL,
+                                                                           setting ERROR gets you ERROR and FATAL level
+                                                                           logging.
+ -log,--log_to_file <log_to_file>                                          Set the logging location
+ -quiet,--quiet_output_mode                                                Set the logging to quiet mode, no output to
+                                                                           stdout
+ -debug,--debug_mode                                                       Set the logging file string to include a lot
+                                                                           of debugging information (SLOW!)
+ -h,--help                                                                 Generate this help message
+

If you see this message, your Queue installation is ok. You're good to go! If you don't see this message, and instead get an error message, proceed to the next section on troubleshooting.

+
+

2. Troubleshooting

+

Let's try to figure out what's not working.

+

Action

+

First, make sure that your Java version is at least 1.6, by typing the following command:

+
java -version
+

Expected Result

+

You should see something similar to the following text:

+
java version "1.6.0_12"
+Java(TM) SE Runtime Environment (build 1.6.0_12-b04)
+Java HotSpot(TM) 64-Bit Server VM (build 11.2-b01, mixed mode)  
+

Remedial actions

+

If the version is less than 1.6, install the newest version of Java onto the system. If you instead see something like

+
java: Command not found  
+

make sure that java is installed on your machine, and that your PATH variable contains the path to the java executables.

+

On a Mac running OS X 10.5+, you may need to run /Applications/Utilities/Java Preferences.app and drag Java SE 6 to the top to make your machine run version 1.6, even if it has been installed.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Visualize_an_alignment_with_IGV.md b/doc_archive/tutorials/(howto)_Visualize_an_alignment_with_IGV.md new file mode 100644 index 000000000..1cfb9b558 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Visualize_an_alignment_with_IGV.md @@ -0,0 +1,61 @@ +## (howto) Visualize an alignment with IGV + +http://gatkforums.broadinstitute.org/gatk/discussion/6491/howto-visualize-an-alignment-with-igv + +

+ +

Visualize sequence read alignment data (BAM or SAM) on IGV using this quick-start tutorial. The Integrative Genomics Viewer is a non-GATK tool developed at the Broad Institute that allows for interactive exploration of large genomic datasets.

+

Tools involved

+ +

Prerequisites

+ +

Download example data

+ +

Related resources

+ +
+

View aligned reads using IGV

+

To view aligned reads using the Integrative Genomics Viewer (IGV), the SAM or BAM file must be coordinate-sorted and indexed.
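If your own file is not yet coordinate-sorted and indexed, you can prepare it with samtools first (a minimal sketch assuming a reasonably recent samtools; my_reads.bam is a placeholder name, and the tutorial file 6491_snippet.bam is already sorted and indexed):

samtools sort -o my_reads.sorted.bam my_reads.bam
samtools index my_reads.sorted.bam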

+
    +
  1. Always load the reference genome first. Go to Genomes>Load Genome From Server or load from the drop-down menu in the upper left corner. Select Human (1kg, b37+decoy).
  2. +
  3. Load the data file. Go to File>Load from File and select 6491_snippet.bam. IGV automatically uses the corresponding 6491_snippet.bai index in the same folder.
  4. +
  5. Zoom in to see alignments. For our tutorial data, copy and paste 10:96,867,400-96,869,400 into the textbox at the top and press Go. A 2 kbp region of chromosome 10 comes into view as shown in the screenshot above.
  6. +
+

Alongside read data, IGV automatically generates a coverage track that sums the depth of reads for each genomic position.

+

Find a specific read and view as pairs

+ +
    +
  1. Right-click on the alignment track and Select by name. Copy and paste H0164ALXX140820:2:2107:7323:30703 into the read name textbox and press OK. IGV will highlight two reads corresponding to this query name in bold red.
  2. +
  3. Right-click on the alignment track and select View as pairs. The two highlighted reads will display in the same row connected by a line as shown in the screenshot.
  4. +
+

Because IGV holds in memory a limited set of data overlapping with the genomic interval in view (this is what makes IGV fast), the select by name feature also applies only to the data that you call into view. For example, we know this read has a secondary alignment on contig hs37d5 (hs37d5:10,198,000-10,200,000).

+
+

If you jump to this new region, is the read also highlighted in red?

+
+
+

Some tips

+

If you find IGV sluggish, download a Java Web Start jnlp version of IGV that allows more memory. The highest memory setting as of this writing is 10 GB (RAM) for machines with 64-bit Java. For the tutorial example data, the typical 2 GB allocation is sufficient.

+ +

To change display settings, check out either the Alignment Preferences panel or the Alignment track Pop-up menu. For persistent changes to your IGV display settings, use the Preferences panel. For track-by-track changes, use the Pop-up menus.

+

Default Alignment Preferences settings are tuned to genomic sequence libraries. Go to View>Preferences and make sure the settings under the Alignments tab allows you to view reads of interest, e.g. duplicate reads.

+ +

After loading data, adjust viewing modes specific to track type by right-clicking on a track to pop up a menu of options. For alignment tracks, these options are described here.

+
\ No newline at end of file diff --git a/doc_archive/tutorials/Appendix_to_(howto)_Discover_variants_with_GATK.md b/doc_archive/tutorials/Appendix_to_(howto)_Discover_variants_with_GATK.md new file mode 100644 index 000000000..0759ddcb8 --- /dev/null +++ b/doc_archive/tutorials/Appendix_to_(howto)_Discover_variants_with_GATK.md @@ -0,0 +1,67 @@ +## Appendix to (howto) Discover variants with GATK + +http://gatkforums.broadinstitute.org/gatk/discussion/7870/appendix-to-howto-discover-variants-with-gatk + +

GATK TUTORIAL :: Variant Discovery :: Appendix

+

June 2016 - GATK 3.6

+

This document is an appendix to the GATK Tutorial :: Variant Discovery module worksheet. It contains a summary introduction to the scientific context of the tutorial.

+
+

Table of Contents

+
    +
  1. GATK BEST PRACTICES
  2. +
  3. WHAT IS JOINT ANALYSIS?
  4. +
  5. FLAWS OF JOINT ANALYSIS +3.1 The N+1 problem +3.2 Really bad scaling
  6. +
  7. THE GVCF WORKFLOW
  8. +
+
+

1 GATK BEST PRACTICES

+

The GATK Best Practices workflows provide step-by-step recommendations for performing variant discovery analysis in high-throughput sequencing (HTS) data. The following diagram illustrates the GATK Best Practices workflow for germline SNP and Indel discovery in whole genomes and exomes. It includes three phases: pre-processing, variant discovery, and callset refinement.

+ +

Figure 1: Best Practices workflow for germline SNP and Indel discovery in whole-genomes and exomes.

+

Pre-Processing starts from raw sequence data, either in FASTQ or uBAM format, and produces analysis-ready BAM files. Processing steps include alignment to a reference genome as well as some data cleanup operations to correct for technical biases and make the data suitable for analysis.

+

Variant Discovery starts from analysis-ready BAM files and produces a callset in VCF format. Processing involves identifying sites where one or more individuals display possible genomic variation, and applying filtering methods appropriate to the experimental design. The Best Practices version 3.x include key innovations that enable joint analysis of multiple samples in a way that is scalable and allows incremental processing of the sequencing data. Those innovations are the focus of this tutorial.

+

Callset Refinement starts and ends with a VCF callset. Processing involves using metadata such as previously validated callsets to assess and improve genotyping accuracy, attach additional information and evaluate the overall quality of the callset.

+

Learn more about the GATK Best Practices here.

+
+

2 WHAT IS JOINT ANALYSIS?

+

In this context, joint analysis means that we consider evidence from multiple samples in order to determine the genotype of each sample at each site, rather than looking at only one sample at a time in isolation. Considering evidence from multiple samples empowers variant discovery and allows us to detect variants with great sensitivity and genotype samples as accurately as possible. Specifically, we have determined that joint analysis conveys the following benefits:

+ +

There are specific data contexts in which performing joint analysis makes an especially important difference. Two such cases are illustrated below.

+ +

Figure 2: Two cases where joint analysis provides important information that improves either the genotype determination or the interpretation of results.

+

Left: Power of joint analysis in finding mutations at low coverage sites. The variant allele is present in only two of the N samples, in both cases with such low coverage that the variant is not callable when processed separately. Joint calling allows evidence to be accumulated over all samples and renders the variant callable.

+

Right: Importance of joint analysis to square off the genotype matrix, using an example of two disease-relevant variants. If we call these samples independently and produce a variants-only output, neither sample will have records for these two sites, for different reasons: the first sample is homozygous reference while the second sample has no data. Therefore, merging the results from single sample calling will incorrectly treat both of these samples identically as being non-informative.

+

Learn more about joint analysis here.

+
+

3 FLAWS OF JOINT ANALYSIS

+ +

Traditionally, joint analysis was achieved by calling variants jointly across all sample BAMs at the same time, generating a single call set for the entire cohort in a single step.

+

However, that method suffers from two major flaws: the N+1 problem and really bad scaling.

+

3.1 The N+1 problem

+

When you’re getting a large-ish number of samples sequenced (especially clinical samples), you typically get them in small batches over an extended period of time. In the past, this was handled by doing batch calling, i.e. analyzing the samples in batches and combining the resulting VCF callsets as they become available. But that’s not a true joint analysis, and it doesn’t give you the same significant gains that calling variants jointly can yield (on top of producing batch effects). If you wanted to do a true joint analysis using the multisample variant calling approach, you would have to re-call all samples from scratch every time you get even one new sample sequence. And the more samples you add, the more computationally intensive it gets, bringing us to the next problem: really bad scaling.

+

3.2 Really bad scaling

+

Calling variants jointly across samples scales very badly. This is because the calculations involved in variant calling (especially by sophisticated tools like the HaplotypeCaller that perform a graph assembly step) become exponentially more computationally costly as you add samples to the cohort. If you don't have a lot of compute available, you run into limitations very quickly. Even at Broad, where we have fairly ridiculous amounts of compute available, we can't brute-force our way through the numbers for the large cohort sizes that we're called on to handle, such as the 92,000 exomes of the ExAC dataset (see this page).

+
+

4 THE GVCF WORKFLOW

+

The good news is that you don’t actually have to call variants on all your samples together to perform a joint analysis. We have developed a workflow that allows us to decouple the initial identification of potential variant sites, i.e. the variant calling, from the genotyping step, which is the only part that really needs to be done jointly. Since GATK 3.0, you can use the HaplotypeCaller to call variants individually per sample in a special mode, invoked by adding -ERC GVCF to your command line, which generates an intermediate file called a GVCF (for Genomic VCF). You then run a joint genotyping step on all the GVCF files generated for the samples in the cohort. This achieves what we call incremental joint discovery, providing you with all the benefits of classic joint calling (as described above) without the drawbacks.
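
To make the two steps concrete, here is a minimal sketch of the corresponding GATK 3.x command lines; the reference and sample file names (sample1.bam, sample1.g.vcf and so on) are placeholders for illustration, not files from this tutorial:

# Step 1: per-sample calling in GVCF mode (run once for each sample)
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R reference.fasta -I sample1.bam -ERC GVCF -o sample1.g.vcf

# Step 2: joint genotyping across all the per-sample GVCFs
java -jar GenomeAnalysisTK.jar -T GenotypeGVCFs -R reference.fasta -V sample1.g.vcf -V sample2.g.vcf -V sample3.g.vcf -o cohort.vcf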

+ +

Figure 4: The new approach to joint analysis allows incremental processing of samples and scales much better than the traditional approach of calling variants on all samples simultaneously.

+
+

Conclusion

+

This innovative workflow solves both the scaling problem and the N+1 problem that plague traditional methods of joint analysis.

+

From here on out we will refer to this single-sample calling + joint genotyping workflow as the GVCF workflow because it involves the intermediate GVCF file, which uniquely distinguishes it from other methods.

\ No newline at end of file
diff --git a/doc_archive/tutorials/Tutorial_files_provenance:_ASHG15.md b/doc_archive/tutorials/Tutorial_files_provenance:_ASHG15.md
new file mode 100644
index 000000000..7db3b835b
--- /dev/null
+++ b/doc_archive/tutorials/Tutorial_files_provenance:_ASHG15.md
@@ -0,0 +1,98 @@
+## Tutorial files provenance: ASHG15
+
+http://gatkforums.broadinstitute.org/gatk/discussion/6760/tutorial-files-provenance-ashg15
+

This document is intended to be a record of how the tutorial files were prepared for the ASHG 2015 hands-on workshop.

+
+

Reference genome

+

Extracting just chromosome 20 (commands below) produces a 64 Mb file (uncompressed), which is small enough for our purposes, so we don't need to truncate it further; this simplifies future data file preparations.

+
# Extract just chromosome 20
+samtools faidx /humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta 20 > human_g1k_b37_20.fasta
+
+# Create the reference index
+samtools faidx human_g1k_b37_20.fasta
+
+# Create sequence dictionary
+java -jar $PICARD CreateSequenceDictionary R=human_g1k_b37_20.fasta O=human_g1k_b37_20.dict
+
+# Recap files
+-rw-rw-r-- 1 vdauwera wga      164 Oct  1 14:56 human_g1k_b37_20.dict
+-rw-rw-r-- 1 vdauwera wga 64075950 Oct  1 14:41 human_g1k_b37_20.fasta
+-rw-rw-r-- 1 vdauwera wga       20 Oct  1 14:46 human_g1k_b37_20.fasta.fai
+
+

Sequence data

+

We are using the 2nd generation CEU Trio of NA12878, her husband, and their child (samples NA12877, NA12878, and NA12882) in a WGS dataset produced at Broad, with files named after the library preps, Solexa-xxxxxx.bam.

+

1. Extract just chromosome 20:10M-20M bp and filter out chimeric pairs with -rf BadMate

+
java -jar $GATK -T PrintReads -R /path/to/bundle/current/b37/human_g1k_v37_decoy.fasta -I /path/to/Solexa-272221.bam -o NA12877_wgs_20_10M20M.bam -L 20:10000000-20000000 -rf BadMate 
+
+java -jar $GATK -T PrintReads -R /path/to/bundle/current/b37/human_g1k_v37_decoy.fasta -I /path/to/Solexa-272222.bam -o NA12878_wgs_20_10M20M.bam -L 20:10000000-20000000 -rf BadMate 
+
+java -jar $GATK -T PrintReads -R /path/to/bundle/current/b37/human_g1k_v37_decoy.fasta -I /path/to/Solexa-272228.bam -o NA12882_wgs_20_10M20M.bam -L 20:10000000-20000000 -rf BadMate 
+
+# Recap files
+-rw-rw-r-- 1 vdauwera wga     36240 Oct  2 11:55 NA12877_wgs_20_10M20M.bai
+-rw-rw-r-- 1 vdauwera wga 512866085 Oct  2 11:55 NA12877_wgs_20_10M20M.bam
+-rw-rw-r-- 1 vdauwera wga     36176 Oct  2 11:53 NA12878_wgs_20_10M20M.bai
+-rw-rw-r-- 1 vdauwera wga 502282846 Oct  2 11:53 NA12878_wgs_20_10M20M.bam
+-rw-rw-r-- 1 vdauwera wga     36464 Oct  2 12:00 NA12882_wgs_20_10M20M.bai
+-rw-rw-r-- 1 vdauwera wga 505001668 Oct  2 12:00 NA12882_wgs_20_10M20M.bam
+

2. Extract headers and edit manually to remove all contigs except 20 and sanitize internal filepaths

+
samtools view -H NA12877_wgs_20_10M20M.bam > NA12877_header.txt
+
+samtools view -H NA12878_wgs_20_10M20M.bam > NA12878_header.txt
+
+samtools view -H NA12882_wgs_20_10M20M.bam > NA12882_header.txt
+

The manual editing is not shown here; it consists of deleting the unwanted contig @SQ lines and removing identifying information from internal file paths.
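
If you prefer to script the contig trimming, a hypothetical one-liner along these lines would do it (the output file name is made up for illustration); scrubbing the internal file paths still has to be done by hand:

# Hypothetical sketch: keep every non-@SQ header line, plus only the @SQ line for chromosome 20
awk '!/^@SQ/ || /SN:20[[:space:]]/' NA12877_header.txt > NA12877_header_20.txt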

+

3. Flip BAM to SAM

+
java -jar $PICARD SamFormatConverter I=NA12877_wgs_20_10M20M.bam O=NA12877_wgs_20_10M20M.sam
+
+java -jar $PICARD SamFormatConverter I=NA12878_wgs_20_10M20M.bam O=NA12878_wgs_20_10M20M.sam
+
+java -jar $PICARD SamFormatConverter I=NA12882_wgs_20_10M20M.bam O=NA12882_wgs_20_10M20M.sam
+
+#Recap files
+-rw-rw-r-- 1 vdauwera wga 1694169101 Oct  2 12:28 NA12877_wgs_20_10M20M.sam
+-rw-rw-r-- 1 vdauwera wga 1661483309 Oct  2 12:30 NA12878_wgs_20_10M20M.sam
+-rw-rw-r-- 1 vdauwera wga 1696553456 Oct  2 12:31 NA12882_wgs_20_10M20M.sam
+

4. Re-header the SAMs

+
java -jar $PICARD ReplaceSamHeader I=NA12877_wgs_20_10M20M.sam O=NA12877_wgs_20_10M20M_RH.sam HEADER=NA12877_header.txt
+
+java -jar $PICARD ReplaceSamHeader I=NA12878_wgs_20_10M20M.sam O=NA12878_wgs_20_10M20M_RH.sam HEADER=NA12878_header.txt    
+
+java -jar $PICARD ReplaceSamHeader I=NA12882_wgs_20_10M20M.sam O=NA12882_wgs_20_10M20M_RH.sam HEADER=NA12882_header.txt    
+
+# Recap files
+-rw-rw-r-- 1 vdauwera wga 1694153715 Oct  2 12:35 NA12877_wgs_20_10M20M_RH.sam
+-rw-rw-r-- 1 vdauwera wga 1661467923 Oct  2 12:37 NA12878_wgs_20_10M20M_RH.sam
+-rw-rw-r-- 1 vdauwera wga 1696538104 Oct  2 12:38 NA12882_wgs_20_10M20M_RH.sam
+

5. Sanitize the SAMs to get rid of MATE_NOT_FOUND errors

+
java -jar $PICARD RevertSam I=NA12877_wgs_20_10M20M_RH.sam O=NA12877_wgs_20_10M20M_RS.sam SORT_ORDER=queryname RESTORE_ORIGINAL_QUALITIES=false REMOVE_DUPLICATE_INFORMATION=false REMOVE_ALIGNMENT_INFORMATION=false ATTRIBUTE_TO_CLEAR=null SANITIZE=true MAX_DISCARD_FRACTION=0.001
+
+java -jar $PICARD RevertSam I=NA12878_wgs_20_10M20M_RH.sam O=NA12878_wgs_20_10M20M_RS.sam SORT_ORDER=queryname RESTORE_ORIGINAL_QUALITIES=false REMOVE_DUPLICATE_INFORMATION=false REMOVE_ALIGNMENT_INFORMATION=false ATTRIBUTE_TO_CLEAR=null SANITIZE=true MAX_DISCARD_FRACTION=0.001
+
+java -jar $PICARD RevertSam I=NA12882_wgs_20_10M20M_RH.sam O=NA12882_wgs_20_10M20M_RS.sam SORT_ORDER=queryname RESTORE_ORIGINAL_QUALITIES=false REMOVE_DUPLICATE_INFORMATION=false REMOVE_ALIGNMENT_INFORMATION=false ATTRIBUTE_TO_CLEAR=null SANITIZE=true MAX_DISCARD_FRACTION=0.001
+
+# Recap files
+-rw-rw-r-- 1 vdauwera wga 1683827201 Oct  2 12:45 NA12877_wgs_20_10M20M_RS.sam
+-rw-rw-r-- 1 vdauwera wga 1652093793 Oct  2 12:49 NA12878_wgs_20_10M20M_RS.sam
+-rw-rw-r-- 1 vdauwera wga 1688143091 Oct  2 12:54 NA12882_wgs_20_10M20M_RS.sam
+

6. Sort the SAMs, convert back to BAM and create index

+
java -jar $PICARD SortSam I=NA12877_wgs_20_10M20M_RS.sam O=NA12877_wgs_20_10M20M_V.bam SORT_ORDER=coordinate CREATE_INDEX=TRUE
+
+java -jar $PICARD SortSam I=NA12878_wgs_20_10M20M_RS.sam O=NA12878_wgs_20_10M20M_V.bam SORT_ORDER=coordinate CREATE_INDEX=TRUE
+
+java -jar $PICARD SortSam I=NA12882_wgs_20_10M20M_RS.sam O=NA12882_wgs_20_10M20M_V.bam SORT_ORDER=coordinate CREATE_INDEX=TRUE
+
+#recap files
+-rw-rw-r-- 1 vdauwera wga     35616 Oct  2 13:08 NA12877_wgs_20_10M20M_V.bai
+-rw-rw-r-- 1 vdauwera wga 508022682 Oct  2 13:08 NA12877_wgs_20_10M20M_V.bam
+-rw-rw-r-- 1 vdauwera wga     35200 Oct  2 13:06 NA12878_wgs_20_10M20M_V.bai
+-rw-rw-r-- 1 vdauwera wga 497742417 Oct  2 13:06 NA12878_wgs_20_10M20M_V.bam
+-rw-rw-r-- 1 vdauwera wga     35632 Oct  2 13:04 NA12882_wgs_20_10M20M_V.bai
+-rw-rw-r-- 1 vdauwera wga 500446729 Oct  2 13:04 NA12882_wgs_20_10M20M_V.bam
+

7. Validate BAMs; should all output "No errors found"

+
java -jar $PICARD ValidateSamFile I=NA12877_wgs_20_10M20M_V.bam M=SUMMARY
+
+java -jar $PICARD ValidateSamFile I=NA12878_wgs_20_10M20M_V.bam M=SUMMARY
+
+java -jar $PICARD ValidateSamFile I=NA12882_wgs_20_10M20M_V.bam M=SUMMARY
\ No newline at end of file