RNA-seq data analysis on S. epidermidis 1585

gene_x 0 like s 580 view s

Tags: pipeline, RNA-seq

  1. prepare input reads and samplesheet

    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_15_min_A/EX_15_min_A_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_15_min_A/EX_15_min_A_2.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_15_min_B/EX_15_min_B_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_15_min_B/EX_15_min_B_2.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_15_min_C/EX_15_min_C_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_15_min_C/EX_15_min_C_2.fq.gz .
    
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_1h_A/EX_1h_A_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_1h_A/EX_1h_A_2.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_1h_B/EX_1h_B_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_1h_B/EX_1h_B_2.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_1h_C/EX_1h_C_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_1h_C/EX_1h_C_2.fq.gz .
    
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_2h_A/EX_2h_A_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_2h_A/EX_2h_A_2.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_2h_B/EX_2h_B_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_2h_B/EX_2h_B_2.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_2h_C/EX_2h_C_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_2h_C/EX_2h_C_2.fq.gz .
    
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_4h_A/EX_4h_A_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_4h_A/EX_4h_A_2.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_4h_B/EX_4h_B_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_4h_B/EX_4h_B_2.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_4h_C/EX_4h_C_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_4h_C/EX_4h_C_2.fq.gz .
    
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_6h_A/EX_6h_A_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_6h_A/EX_6h_A_2.fq.gz .
    ln -s ./F23A430001245-01_SEPzqcuP/soapnuke/clean/EX_6h_B/EX_6h_B_1.fq.gz .
    ln -s ./F23A430001245-01_SEPzqcuP/soapnuke/clean/EX_6h_B/EX_6h_B_2.fq.gz .
    ln -s ./F23A430001245-01_SEPzqcuP/soapnuke/clean/EX_6h_C/EX_6h_C_1.fq.gz .
    ln -s ./F23A430001245-01_SEPzqcuP/soapnuke/clean/EX_6h_C/EX_6h_C_2.fq.gz .
    
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_Day_4_A/EX_Day_4_A_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_Day_4_A/EX_Day_4_A_2.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_Day_4_B/EX_Day_4_B_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_Day_4_B/EX_Day_4_B_2.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_Day_4_C/EX_Day_4_C_1.fq.gz .
    ln -s ./F23A430001245_SEPiovzP/soapnuke/clean/EX_Day_4_C/EX_Day_4_C_2.fq.gz .
    
    sample,fastq_1,fastq_2,strandedness
    EX_15_min_A,EX_15_min_A_1.fq.gz,EX_15_min_A_2.fq.gz,auto
    EX_15_min_B,EX_15_min_B_1.fq.gz,EX_15_min_B_2.fq.gz,auto
    EX_15_min_C,EX_15_min_C_1.fq.gz,EX_15_min_C_2.fq.gz,auto
    
  2. Notes and Debugs

    #-- NOTE1: STAR cannot work simoutenously twice at the same time! --
    
    #-- NOTE2 --
    The GTF file might be corrupted!
      Stop at line : NC_019234.1    RefSeq  exon    1600    1977    .   +   0   ID "cds-HXG45_RS00010"; Parent "gene-HXG45_RS00010"; Note "incomplete%3B partial in the middle of a contig%3B missing N-terminus and C-terminus"; Ontology_term "GO:0015074"; end_range "1977,."; gbkey "CDS"; go_process "DNA integration|0015074||IEA"; inference "COORDINATES: similar to AA sequence:RefSeq:WP_010972061.1"; locus_tag "HXG45_RS00010"; partial "true"; product "DDE-type integrase/transposase/recombinase"; pseudo "true"; start_range ".,1600"; transl_table "11"
      Error Message: Cannot find gene_id!
    
    #cut -d$'\t' -f2 RP62A.gtf | sort -u
    cmsearch
    GeneMarkS-2+
    Protein Homology --> ProteinHomology
    RefSeq
    tRNAscan-SE
    
    #cut -d$'\t' -f3 RP62A.gtf | sort -u
    gene
    pseudogene
    
    CDS
    rRNA
    tRNA
    
    direct_repeat
    exon
    ncRNA
    binding_site
    region
    riboswitch
    RNase_P_RNA
    
    sequence_feature
    SRP_RNA
    tmRNA
    
    #-- BUG1: mamba update picard --
    #2.18.27-SNAPSHOT --> bioconda::picard=3.0.0
    conda install -c conda-forge mamba
    mamba update picard
      - ca-certificates   2023.7.22  hbcca054_0     conda-forge                    
      + ca-certificates  2023.11.17  hbcca054_0     conda-forge/linux-64     Cached
      - nextflow            20.04.1  hecda079_1     bioconda                       
      + nextflow            23.10.0  hdfd78af_0     bioconda/noarch          Cached
      - openjdk              11.0.1  h516909a_1016  conda-forge                    
      + openjdk              17.0.3  h58dac75_5     conda-forge/linux-64     Cached
      - picard                3.0.0  hdfd78af_0     bioconda                       
      + picard                3.1.1  hdfd78af_0     bioconda/noarch          Cached
    mamba update rsem
    
    #-- BUG2: mamba update gffread --
    cd /mnt/h1/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/work/b4/703a2595f234c7866e49d9a033f943
    gffread \
          RP62A.gff \
          --keep-exon-attrs -F -T \
          -o RP62A.gtf
      - gffread   0.9.12  0           bioconda                    
      + gffread   0.12.7  hdcf5f25_3  bioconda/linux-64     Cached
    
    #-- NOTE: If you use nf-core/rnaseq for your analysis please cite: --
    #* The pipeline
    #  https://doi.org/10.5281/zenodo.1400710
    #* The nf-core framework
    #  https://doi.org/10.1038/s41587-020-0439-x
    #* Software dependencies
    #  https://github.com/nf-core/rnaseq/blob/master/CITATIONS.md
    
    #-- BUG3: under rnaseq_2021 open R --
    #using R host rather than conda R
    mamba remove r-base
    #-I"/home/jhuang/miniconda3/envs/rnaseq_2021/lib/R/include"
    if (!requireNamespace("BiocManager", quietly = TRUE))
        install.packages("BiocManager")
    BiocManager::install("SummarizedExperiment")
    BiocManager::install("tximport")
    #Error in citation("tximeta") : there is no package called ‘tximeta’
    #  Execution halted
    
    #sudo apt-get update
    #sudo apt-get install libcurl4-openssl-dev
    
    #BiocManager::install(version = "3.16")
    BiocManager::install(c("AnnotationDbi", "httr", "BiocFileCache", "biomaRt", "GenomicFeatures", "ensembldb", "curl", "AnnotationHub"))
    BiocManager::install(c("biomaRt", "GenomicFeatures", "ensembldb", "AnnotationHub", "tximeta"))
    BiocManager::install(c("tximeta"))
    
    #conda update -n base -c defaults conda
    #conda update --all
    
    #!/usr/bin/env Rscript
    /mnt/h1/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/rnaseq/bin/salmon_tximport.r \
          NULL \
          salmon \
          salmon.merged
    
  3. prepare gtf file

    #  --gtf_extra_attributes        [string]  By default, the pipeline uses the `gene_name` field to obtain additional gene identifiers from the input GTF file when 
                                              running Salmon. [default: gene_name] 
    #  --gtf_group_features          [string]  Define the attribute type used to group features in the GTF file when running Salmon. [default: gene_id]
    #  --featurecounts_group_type    [string]  The attribute type used to group feature types in the GTF file when generating the biotype plot with 
                                              featureCounts. [default: gene_biotype] 
    #  --featurecounts_feature_type  [string]  By default, the pipeline assigns reads based on the 'exon' attribute within the GTF file. [default: exon]
    #--gtf_extra_attributes gene 
    #--gtf_group_features Parent --featurecounts_group_type gene_biotype --featurecounts_feature_type CDS (outdated since [CHANGE1,2,3])
    
    #under hamm
    conda activate rnaseq
    ln -s /home/jhuang/Tools/nf-core-rnaseq-3.12.0/ rnaseq
    
    # ------------ gff_to_gtf.py, then modify *.gtf file as follows -------------
    #Upload the scripts gff_to_gtf.py and modify_gtf.py.
    python3 gff_to_gtf.py RP62A.gff3 RP62A.gtf
    
    #[CHANGE1] ID in gene and Parent in CDS --> gene_id; ID in exon --> transcript_id; Name in gene --> gene_name
    #[CHANGE2] "Protein Homology" --> "RefSeq"
    #[CHANGE3] CDS --> exon
    #[CHANGE4?] --gtf_extra_attributes ||||gene|||| refers to ID=gene-SERP_RS00220;Name=noc;gbkey=Gene;||||gene=noc||||; "gene=" exists in both gene and exon. choose gene as gtf_extra_attributes! (see --gtf_extra_attributes gene in the next command)
    
    [1] python3 gff_to_gtf.py RP62A.gff3 RP62A.gtf
    [2] python3 modify_gtf.py RP62A.gtf RP62A_m.gtf
    [3] sed -i -e 's/gene_id "rna-/gene_id "gene-/g' RP62A_m.gtf
    #NOTE that "ProteinHomology exon" are from "ProteinHomology CDS" 
    [4] MANUALLY checking why two are missing, because the gene has two CDS/exons.
    #(rnaseq) jhuang@hamm:/mnt/h1/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/results/genome$ grep ">cds" results/genome/genome.transcripts.fa  | sed 's/>//g' - | sort > CDS_2.txt
    #2415
    #(rnaseq) jhuang@hamm:/mnt/h1/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/results/genome$ grep ">rna" genome.transcripts.fa  | wc -l
    #84
    #cut -d$'\t' -f9 CDS_1.txt | cut -d';' -f1 | sed 's/ID=//g' - | sort > CDS_1_.txt
    #2417
    #2268d2267
    #< cds-WP_010959142.1
    #2401d2399
    #< cds-WP_161384733.1
    
    NC_002976.3     RefSeq  gene    423814  424930  .       +       .       gene_id "gene-SERP_RS02220"; gene_name "prfB"; gbkey "Gene"; gene "prfB"; gene_biotype "protein_coding"; locus_tag "SERP_RS02220"; old_locus_tag "SE0421%2CSERP0421"
    NC_002976.3     ProteinHomology exon    423814  423885  .       +       0       transcript_id "cds-WP_010959142.1"; gene_id "gene-SERP_RS02220"; Dbxref "GenBank:WP_010959142.1"; Name "WP_010959142.1"; Note "programmed frameshift"; Ontology_term "GO:0006415,GO:0003747"; exception "ribosomal slippage"; gbkey "CDS"; gene "prfB"; go_function "translation release factor activity|0003747||IEA"; go_process "translational termination|0006415||IEA"; inference "COORDINATES: similar to AA sequence:RefSeq:WP_010959142.1"; locus_tag "SERP_RS02220"; product "peptide chain release factor 2"; protein_id "WP_010959142.1"; transl_table "11"
    NC_002976.3     ProteinHomology exon    423887  424930  .       +       0       transcript_id "cds-WP_010959142.1"; gene_id "gene-SERP_RS02220"; Dbxref "GenBank:WP_010959142.1"; Name "WP_010959142.1"; Note "programmed frameshift"; Ontology_term "GO:0006415,GO:0003747"; exception "ribosomal slippage"; gbkey "CDS"; gene "prfB"; go_function "translation release factor activity|0003747||IEA"; go_process "translational termination|0006415||IEA"; inference "COORDINATES: similar to AA sequence:RefSeq:WP_010959142.1"; locus_tag "SERP_RS02220"; product "peptide chain release factor 2"; protein_id "WP_010959142.1"; transl_table "11"
    NC_002976.3     RefSeq  gene    2256196 2256869 .       +       .       gene_id "gene-SERP_RS10980"; gene_name "SERP_RS10980"; gbkey "Gene"; gene_biotype "protein_coding"; locus_tag "SERP_RS10980"; old_locus_tag "SERP2219"
    NC_002976.3     ProteinHomology exon    2256196 2256385 .       +       0       transcript_id "cds-WP_161384733.1"; gene_id "gene-SERP_RS10980"; Dbxref "GenBank:WP_161384733.1"; Name "WP_161384733.1"; Note "programmed frameshift"; Ontology_term "GO:0004803"; exception "ribosomal slippage"; gbkey "CDS"; go_function "transposase activity|0004803||IEA"; inference "COORDINATES: similar to AA sequence:RefSeq:WP_064305258.1"; locus_tag "SERP_RS10980"; product "IS6-like element IS257 family transposase"; protein_id "WP_161384733.1"; transl_table "11"
    NC_002976.3     ProteinHomology exon    2256385 2256869 .       +       2       transcript_id "cds-WP_161384733.1"; gene_id "gene-SERP_RS10980"; Dbxref "GenBank:WP_161384733.1"; Name "WP_161384733.1"; Note "programmed frameshift"; Ontology_term "GO:0004803"; exception "ribosomal slippage"; gbkey "CDS"; go_function "transposase activity|0004803||IEA"; inference "COORDINATES: similar to AA sequence:RefSeq:WP_064305258.1"; locus_tag "SERP_RS10980"; product "IS6-like element IS257 family transposase"; protein_id "WP_161384733.1"; transl_table "11"
    
    --> "cds-WP_010959142.1" and "cds-WP_161384733.1" occur twice!
    
    NC_002976.3     RefSeq  gene    423814  424930  .       +       .       gene_id "gene-SERP_RS02220"; gene_name "prfB"; gbkey "Gene"; gene "prfB"; gene_biotype "protein_coding"; locus_tag "SERP_RS02220"; old_locus_tag "SE0421%2CSERP0421"
    NC_002976.3     ProteinHomology exon    423814  424930  .       +       0       transcript_id "cds-WP_010959142.1"; gene_id "gene-SERP_RS02220"; Dbxref "GenBank:WP_010959142.1"; Name "WP_010959142.1"; Note "programmed frameshift"; Ontology_term "GO:0006415,GO:0003747"; exception "ribosomal slippage"; gbkey "CDS"; gene "prfB"; go_function "translation release factor activity|0003747||IEA"; go_process "translational termination|0006415||IEA"; inference "COORDINATES: similar to AA sequence:RefSeq:WP_010959142.1"; locus_tag "SERP_RS02220"; product "peptide chain release factor 2"; protein_id "WP_010959142.1"; transl_table "11"
    NC_002976.3     RefSeq  gene    2256196 2256869 .       +       .       gene_id "gene-SERP_RS10980"; gene_name "SERP_RS10980"; gbkey "Gene"; gene_biotype "protein_coding"; locus_tag "SERP_RS10980"; old_locus_tag "SERP2219"
    NC_002976.3     ProteinHomology exon    2256196 2256869 .       +       0       transcript_id "cds-WP_161384733.1"; gene_id "gene-SERP_RS10980"; Dbxref "GenBank:WP_161384733.1"; Name "WP_161384733.1"; Note "programmed frameshift"; Ontology_term "GO:0004803"; exception "ribosomal slippage"; gbkey "CDS"; go_function "transposase activity|0004803||IEA"; inference "COORDINATES: similar to AA sequence:RefSeq:WP_064305258.1"; locus_tag "SERP_RS10980"; product "IS6-like element IS257 family transposase"; protein_id "WP_161384733.1"; transl_table "11"
    
    --> "cds-WP_010959142.1" and "cds-WP_161384733.1" occur once!
    
    ln -s /home/jhuang/Tools/nf-core-rnaseq-3.12.0/ rnaseq
    
    (rnaseq) nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results --fasta RP62A.fasta --gtf RP62A_m.gtf -profile test_full -resume --max_memory 100.GB --max_time 2400.h --save_reference --aligner star_salmon             --skip_rseqc --skip_dupradar --skip_preseq --skip_biotype_qc --skip_deseq2_qc --min_mapped_reads 0  --gtf_extra_attributes gene
    #grep ">" genome.transcripts.fa  | wc -l --> 2499  (2,529= CDSs (2,445) + RNA(84))
    
    #-- Ben project: human + bacterium + plasmid --
    .gff3
    python3 gff_to_gtf.py CP009367.gff3 CP009367.gtf
    python3 modify_gtf.py CP009367.gtf CP009367_m.gtf
    sed -i -e 's/gene_id "rna-/gene_id "gene-/g' CP009367_m.gtf
    
    python3 gff_to_gtf.py NC_019234.gff3 NC_019234.gtf
    python3 modify_gtf.py NC_019234.gtf NC_019234_m.gtf
    sed -i -e 's/gene_id "rna-/gene_id "gene-/g' NC_019234_m.gtf
    
    python3 genbank_to_gtf.py 1585.gb 1585.gtf
    python3 modify_gtf.py 1585.gtf 1585_m.gtf
    #python3 correct_gtf.py
    python3 transform_gtf.py 1585_m.gtf 1585_m_.gtf
    sed -i -e 's/transcript_id \"/transcript_id \"tx-/g' 1585_m_.gtf
    sed -i -e 's/exon_id \"/exon_id \"exon-/g' 1585_m_.gtf
    #BUG: why 7052 >= expected 7014
    
    #-- MANUALLY: gene positions out of boundary --
    #Transcript cds-WP_011100764.1 is out of chromosome NC_019234.1's boundary!
    NC_019234.1     RefSeq  gene    66549   67571   .       +       .       gene_id "gene-HXG45_RS00005"; gene_name "repA"; gbkey "Gene"; gene "repA"; gene_biotype "protein_coding"; locus_tag "HXG45_RS00005"; old_locus_tag "D743_p54"
    NC_019234.1     RefSeq  exon    66549   67571   .       +       0       transcript_id "cds-WP_011100764.1"; gene_id "gene-HXG45_RS00005"; Dbxref "GenBank:WP_011100764.1"; Name "WP_011100764.1"; gbkey "CDS"; gene "repA"; inference "COORDINATES: similar to AA sequence:RefSeq:WP_011100764.1"; locus_tag "HXG45_RS00005"; product "plasmid replication initiator RepA"; protein_id "WP_011100764.1"; transl_table "11"
    #-->
    NC_019234.1 RefSeq  gene    66549   66845   .   +   .   gene_id "gene-HXG45_RS00005_1"; gene_name "repA"; gbkey "Gene"; gene "repA"; gene_biotype "protein_coding"; locus_tag "HXG45_RS00005"; old_locus_tag "D743_p54"
    NC_019234.1 RefSeq  exon    66549   66845   .   +   0   transcript_id "cds-WP_011100764.1_1"; gene_id "gene-HXG45_RS00005_1"; Dbxref "GenBank:WP_011100764.1"; Name "WP_011100764.1"; gbkey "CDS"; gene "repA"; inference "COORDINATES: similar to AA sequence:RefSeq:WP_011100764.1"; locus_tag "HXG45_RS00005"; product "plasmid replication initiator RepA"; protein_id "WP_011100764.1"; transl_table "11"
    NC_019234.1 RefSeq  gene    1   726 .   +   .   gene_id "gene-HXG45_RS00005_2"; gene_name "repA"; gbkey "Gene"; gene "repA"; gene_biotype "protein_coding"; locus_tag "HXG45_RS00005"; old_locus_tag "D743_p54"
    NC_019234.1 RefSeq  exon    1   726 .   +   0   transcript_id "cds-WP_011100764.1_2"; gene_id "gene-HXG45_RS00005_2"; Dbxref "GenBank:WP_011100764.1"; Name "WP_011100764.1"; gbkey "CDS"; gene "repA"; inference "COORDINATES: similar to AA sequence:RefSeq:WP_011100764.1"; locus_tag "HXG45_RS00005"; product "plasmid replication initiator RepA"; protein_id "WP_011100764.1"; transl_table "11"
    
    (rnaseq) nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_NC_019234 --fasta NC_019234.fasta --gtf NC_019234_m.gtf -profile test_full -resume --max_memory 100.GB --max_time 2400.h --save_reference --aligner star_salmon             --skip_rseqc --skip_dupradar --skip_preseq --skip_biotype_qc --skip_deseq2_qc --min_mapped_reads 0  --gtf_extra_attributes gene
    nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_CP009367 --fasta CP009367.fasta --gtf CP009367_m.gtf -profile test_full -resume --max_memory 100.GB --max_time 2400.h --save_reference --aligner star_salmon             --skip_rseqc --skip_dupradar --skip_preseq --skip_biotype_qc --skip_deseq2_qc --min_mapped_reads 0  --gtf_extra_attributes gene
    nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 -profile test_full -resume --max_memory 100.GB --max_time 2400.h --save_align_intermeds --save_unaligned --aligner star_salmon --pseudo_aligner salmon
    nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_1585 --fasta 1585.fasta --gtf 1585_m_.gtf -profile test_full -resume --max_memory 100.GB --max_time 2400.h --save_reference --aligner star_salmon --skip_rseqc --skip_dupradar --skip_preseq --skip_biotype_qc --skip_deseq2_qc --min_mapped_reads 0
    
    #TODO: look if gene-HXG45_RS00010 exist!
    
    # -- check the intermediate results --
    #vim salmon.merged.gene_counts.tsv
    gene_id gene_name       EX_15_min_A
    gene-SERP_RS00195       rpmH    298
    gene-SERP_RS00200       rnpA    90
    gene-SERP_RS00205       mnmE    1733
    
    #vim salmon.merged.transcript_counts.tsv
    tx      gene_id EX_15_min_A
    cds-WP_000240855.1      gene-SERP_RS00195       298
    cds-WP_001832522.1      gene-SERP_RS00200       90
    cds-WP_001831768.1      gene-SERP_RS00205       1733
    
  4. gff_to_gtf.py (code)

    import sys
    
    def gff3_to_gtf(gff3_file, gtf_file):
        with open(gff3_file, 'r') as gff3, open(gtf_file, 'w') as gtf:
            for line in gff3:
                if line.startswith('#'):
                    continue  # Skip header lines
    
                parts = line.strip().split('\t')
                if len(parts) != 9:
                    continue  # Skip invalid lines
    
                # Extract fields from GFF3
                seqname, source, feature, start, end, score, strand, frame, attributes = parts
    
                # Convert attributes to GTF format
                attr_pairs = [f.strip() for f in attributes.split(';') if f.strip() != '']
                attr_str = '; '.join([f'{key} "{value}"' for part in attr_pairs for key, value in [part.split('=')]])
    
                # Write to GTF file
                gtf_line = f'{seqname}\t{source}\t{feature}\t{start}\t{end}\t{score}\t{strand}\t{frame}\t{attr_str};\n'
                gtf.write(gtf_line)
    
    if __name__ == "__main__":
        if len(sys.argv) != 3:
            print("Usage: python script.py <input.gff3> <output.gtf>")
            sys.exit(1)
    
        gff3_file = sys.argv[1]
        gtf_file = sys.argv[2]
        gff3_to_gtf(gff3_file, gtf_file)
    
  5. genbank_to_gtf.py (code)

    import argparse
    from Bio import SeqIO
    
    def genbank_to_gtf(genbank_file, gtf_file):
        with open(genbank_file, "r") as input_handle, open(gtf_file, "w") as output_handle:
            for record in SeqIO.parse(input_handle, "genbank"):
                for feature in record.features:
                    if feature.type == "gene" or feature.type == "CDS":
                        start = feature.location.start.position + 1
                        end = feature.location.end.position
                        strand = '+' if feature.location.strand == 1 else '-'
                        attributes = []
    
                        gene_id = None
                        if "gene_id" in feature.qualifiers:
                            gene_id = feature.qualifiers["gene_id"][0]
                        elif "gene" in feature.qualifiers:
                            gene_id = feature.qualifiers["gene"][0]
                        elif "locus_tag" in feature.qualifiers:
                            gene_id = feature.qualifiers["locus_tag"][0]
    
                        if gene_id is not None:
                            attributes.append(f'gene_id "{gene_id}"')
    
                        transcript_id = None
                        if "locus_tag" in feature.qualifiers:
                            transcript_id = feature.qualifiers["locus_tag"][0]
                            attributes.append(f'transcript_id "{transcript_id}"')
    
                        attribute_str = "; ".join(attributes) + ";"
                        gtf_line = f"{record.id}\t.\t{feature.type}\t{start}\t{end}\t.\t{strand}\t.\t{attribute_str}\n"
                        output_handle.write(gtf_line)
    
    def main():
        parser = argparse.ArgumentParser(description="Convert GenBank file to GTF format.")
        parser.add_argument("genbank_file", help="Input GenBank file")
        parser.add_argument("gtf_file", help="Output GTF file")
        args = parser.parse_args()
    
        genbank_to_gtf(args.genbank_file, args.gtf_file)
    
    if __name__ == "__main__":
        main()
    
  6. modify_gtf.py (code)

    import sys
    
    def modify_gtf(input_file, output_file):
        with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
            for line in infile:
                if line.startswith('#'):
                    outfile.write(line)  # Copy comment lines
                    continue
    
                parts = line.strip().split('\t')
                if len(parts) != 9:
                    continue  # Skip invalid lines
    
                record_type = parts[2]
                attributes = parts[8]
    
                # Change "Protein Homology" to "RefSeq"
                if parts[1] == "Protein Homology":
                    parts[1] = "ProteinHomology"
    
                new_attributes = []
                for attr in attributes.split(';'):
                    if attr.strip():
                        key_value = attr.strip().split(' ', 1)  # Split only on the first space
                        if len(key_value) == 2:
                            key, value = key_value
                            if (record_type == 'gene' or record_type == 'pseudogene') and key == 'ID':
                                key = 'gene_id'
                            elif (record_type == 'gene' or record_type == 'pseudogene') and key == 'Name':
                                key = 'gene_name'
                            elif (record_type == 'CDS' or record_type == 'tRNA' or record_type == 'rRNA' or record_type == 'SRP_RNA') and key == 'ID':
                                key = 'transcript_id'
                            elif (record_type == 'CDS' or record_type == 'tRNA' or record_type == 'rRNA' or record_type == 'SRP_RNA') and key == 'Parent':
                                key = 'gene_id'
                            elif (record_type == 'exon') and key == 'ID':
                                key = 'exon_id'
                            elif (record_type == 'exon') and key == 'Parent':
                                key = 'transcript_id'
                                new_attributes.append(f'gene_id {value}')
                            new_attributes.append(f'{key} {value}')
                        else:
                            new_attributes.append(attr.strip())
    
                # Change record type from CDS to exon
                if record_type == "CDS":
                    parts[2] = "exon"
    
                parts[8] = '; '.join(new_attributes)
                outfile.write('\t'.join(parts) + '\n')
    
    if __name__ == "__main__":
        if len(sys.argv) != 3:
            print("Usage: python modify_gtf.py <input_file> <output_file>")
            sys.exit(1)
    
        input_file = sys.argv[1]
        output_file = sys.argv[2]
        modify_gtf(input_file, output_file)
    
  7. correct_gtf.py (code)

    input_file = '1585_m.gtf'  # Replace with your input file path
    output_file = '1585_m_.gtf'  # Replace with your desired output file path
    
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            if line.strip() and not line.startswith('#'):  # Skip empty or comment lines
                parts = line.strip().split('\t')
                if parts[2] == 'gene':  # Check if the feature type is 'gene'
                    attributes = parts[8].split(';')
                    attributes = [attr.strip() for attr in attributes if 'transcript_id' not in attr]  # Remove transcript_id for gene
                    parts[8] = '; '.join(attributes) + ';' if attributes else '.'
                    line = '\t'.join(parts) + '\n'
            outfile.write(line)
    
    print(f"Processed file saved as {output_file}")
    
  8. transform_gtf.py (code)

    import sys
    
    def transform_data(input_data):
        output_data = []
    
        for line in input_data:
            fields = line.split()
            entry_type = fields[2]
            attributes = ' '.join(fields[8:])
            attributes_dict = dict(item.split(' ', 1) for item in attributes.split('; '))
    
            gene_id = attributes_dict.get('gene_id').replace('"', '')
            transcript_id = attributes_dict.get('transcript_id').replace('"', '')
    
            if not gene_id.startswith("SF028"):
                gene_name = f'gene_name "{gene_id}"'
                gene_id = f'gene_id "{transcript_id}"'
            else:
                gene_name = ''
    
            if entry_type == "gene":
                gene_line = f"{fields[0]}\t{fields[1]}\tgene\t{fields[3]}\t{fields[4]}\t{fields[5]}\t{fields[6]}\t{fields[7]}\t{gene_id}; {gene_name}".strip()
                output_data.append(gene_line)
                transcript_line = f"{fields[0]}\t{fields[1]}\ttranscript\t{fields[3]}\t{fields[4]}\t{fields[5]}\t{fields[6]}\t{fields[7]}\t{gene_id}; transcript_id \"{transcript_id}\""
                output_data.append(transcript_line)
            elif entry_type == "exon":
                exon_line = f"{fields[0]}\t{fields[1]}\texon\t{fields[3]}\t{fields[4]}\t{fields[5]}\t{fields[6]}\t{fields[7]}\t{gene_id}; transcript_id \"{transcript_id}\"; exon_id \"{transcript_id}\""
                output_data.append(exon_line)
    
        return output_data
    
    def main(input_file, output_file):
        with open(input_file, 'r') as file:
            input_data = file.readlines()
    
        transformed_data = transform_data(input_data)
    
        with open(output_file, 'w') as file:
            for line in transformed_data:
                file.write(line + '\n')
    
    if __name__ == "__main__":
        if len(sys.argv) != 3:
            print("Usage: python script.py <input_file> <output_file>")
            sys.exit(1)
    
        input_file = sys.argv[1]
        output_file = sys.argv[2]
        main(input_file, output_file)
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum