mimseq
Advanced tools
| Metadata-Version: 2.1 | ||
| Name: mimseq | ||
| Version: 1.3.7 | ||
| Version: 1.3.8 | ||
| Summary: Custom high-throughput tRNA sequencing alignment and quantification pipeline based on modification induced misincorporation cDNA synthesis. | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/nedialkova-lab/mim-tRNAseq |
@@ -9,2 +9,3 @@ biopython | ||
| pybedtools | ||
| requests | ||
| statsmodels |
@@ -32,3 +32,2 @@ LICENSE.txt | ||
| mimseq/data/modomics | ||
| mimseq/data/modomics_orig | ||
| mimseq/data/tRNAmatureseq.cm | ||
@@ -35,0 +34,0 @@ mimseq/data/araTha1-eColitK/FastaHeadersforMimseq.py |
+1
-4
@@ -97,8 +97,5 @@ #! /usr/bin/env python3 | ||
| map_round = 1 #first round of mapping | ||
| # Parse tRNA and modifications, generate SNP index | ||
| modifications = os.path.dirname(os.path.realpath(__file__)) | ||
| modifications += "/modifications" | ||
| coverage_bed, snp_tolerance, mismatch_dict, insert_dict, del_dict, mod_lists, Inosine_lists, Inosine_clusters, tRNA_dict, cluster_dict, cluster_perPos_mismatchMembers \ | ||
| = modsToSNPIndex(trnas, trnaout, mito_trnas, plastid_trnas, modifications, name, out, double_cca, threads, snp_tolerance, cluster, cluster_id, posttrans, pretrnas, local_mod) | ||
| = modsToSNPIndex(gtRNAdb = trnas, tRNAscan_out = trnaout, mitotRNAs = mito_trnas, plastidtRNAs = plastid_trnas, experiment_name = name, out_dir = out, double_cca = double_cca, threads = threads, snp_tolerance = snp_tolerance, cluster = cluster, cluster_id = cluster_id, posttrans_mod_off = posttrans, pretrnas = pretrnas, local_mod = local_mod) | ||
| structureParser() | ||
@@ -105,0 +102,0 @@ # Generate GSNAP indices |
+132
-98
@@ -31,7 +31,8 @@ #!/usr/bin/env python3 | ||
| def tRNAparser (gtRNAdb, tRNAscan_out, mitotRNAs, plastidtRNAs, modifications_table, posttrans_mod_off, double_cca, pretrnas, local_mod): | ||
| def tRNAparser (gtRNAdb, tRNAscan_out, mitotRNAs, plastidtRNAs, posttrans_mod_off, double_cca, pretrnas, local_mod): | ||
| # tRNA sequence files parser and dictionary building | ||
| # Generate modification reference table | ||
| modifications = modificationParser(modifications_table) | ||
| modifications_file, fetch = getModifications(local_mod) | ||
| modifications = modificationParser(modifications_file, fetch) | ||
| temp_name = gtRNAdb.split("/")[-1] | ||
@@ -218,3 +219,3 @@ | ||
| def getModomics(local_mod): | ||
| # Get full Modomics modified tRNA data from web | ||
| # Get full Modomics modified tRNA data from API | ||
| fetch = False | ||
@@ -230,16 +231,136 @@ if not local_mod: | ||
| log.error("Unable to connect to Modomics database! HTTP error: {}. Check status of Modomics webpage. Using local Modomics files...".format(http_err)) | ||
| modomics_path = os.path.dirname(os.path.realpath(__file__)) + '/data/modomics' | ||
| modomics = open(modomics_path, "r+", encoding = "utf-8") | ||
| modomics = openLocalModomics('/data/modomics') | ||
| except Exception as err: | ||
| log.error("Error in connecting to Modomics: {}. Using local Modomics files...".format(err)) | ||
| modomics_path = os.path.dirname(os.path.realpath(__file__)) + '/data/modomics' | ||
| modomics = open(modomics_path, "r+", encoding = "utf-8") | ||
| modomics = openLocalModomics('/data/modomics') | ||
| else: | ||
| log.warning("Retrieval of Modomics database disabled. Using local files instead...") | ||
| modomics_path = os.path.dirname(os.path.realpath(__file__)) + '/data/modomics' | ||
| modomics = open(modomics_path, "r+", encoding = "utf-8") | ||
| modomics = openLocalModomics('/data/modomics') | ||
| return modomics, fetch | ||
| def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, plastidtRNAs, modifications_table, experiment_name, out_dir, double_cca, threads, snp_tolerance = False, cluster = False, cluster_id = 0.95, posttrans_mod_off = False, pretrnas = False, local_mod = False, search='usearch'): | ||
def openLocalModomics(filepath):
    """Open a data file shipped alongside this module and return the handle.

    ``filepath`` is appended verbatim to the directory that contains this
    module, so it must carry its own leading separator (callers pass values
    such as '/data/modomics' or '/modifications').

    The file is opened read-only as UTF-8 text. The handle is returned open;
    closing it is the caller's responsibility.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    return open(module_dir + filepath, "r", encoding = "utf-8")
def getModifications(local_mod):
    """Fetch the modification lookup table, falling back to the bundled copy.

    Parameters:
        local_mod: when truthy, skip the network entirely and use the local
            '/modifications' file that ships with the package.

    Returns:
        (modifications, fetch) where ``fetch`` is True when ``modifications``
        is the decoded JSON from the Modomics API, and False when it is an
        open text handle on the local file (see openLocalModomics). The pair
        is meant to be handed straight to modificationParser.

    Any failure while talking to the API (HTTP error status, connection
    problem, timeout, undecodable body) is logged and answered with the local
    file rather than raised.
    """
    fetch = False
    if local_mod:
        log.warning("Retrieval of Modomics database disabled. Using local files instead...")
        return openLocalModomics('/modifications'), fetch

    try:
        # Without a timeout requests waits indefinitely on an unresponsive
        # server, which would stall the pipeline instead of triggering the
        # local fallback below. (connect, read) timeouts are in seconds.
        response = requests.get("https://www.genesilico.pl/modomics/api/modifications", timeout=(10, 60))
        response.raise_for_status()
        modifications = response.json()
        fetch = True
        log.info("Modification table retrieved...")
    except HTTPError as http_err:
        log.error("Unable to connect to Modomics database! HTTP error: {}. Check status of Modomics webpage. Using local Modomics files...".format(http_err))
        modifications = openLocalModomics('/modifications')
    except Exception as err:
        # Deliberately broad: any other failure (DNS, timeout, bad JSON) must
        # degrade to the bundled table, not abort the run.
        log.error("Error in connecting to Modomics: {}. Using local Modomics files...".format(err))
        modifications = openLocalModomics('/modifications')

    return modifications, fetch
def modificationParser(modifications_table, fetch):
    """Build the modification lookup dictionary.

    Parameters:
        modifications_table: either the decoded JSON mapping from the
            Modomics API (``fetch`` True) or an iterable of tab-separated
            text lines, e.g. an open file (``fetch`` False).
        fetch: selects which of the two input formats to parse.

    Returns:
        dict keyed by the modification's single-character code, each value
        being ``{'name': ..., 'abbr': ..., 'ref': ...}`` with surrounding
        whitespace stripped.

    JSON records are read through their "abbrev", "name", "short_name" and
    "reference_moiety" (first element) fields. Local lines hold four columns:
    name, abbreviation, reference base and modification code; lines starting
    with '#' are comments. A line with a different column count raises
    ValueError.
    """
    modifications = {}

    if fetch:
        log.info("Parsing Modification JSON data...")
        for data in modifications_table.values():
            modifications[data["abbrev"].strip()] = {
                'name': data["name"].strip(),
                'abbr': data["short_name"].strip(),
                'ref': data["reference_moiety"][0].strip(),
            }
    else:
        log.info("Parsing local Modification data...")
        for line in modifications_table:
            # Skip comments, and blank lines (e.g. a trailing newline at the
            # end of the file) which would otherwise break the 4-way unpack.
            if line.startswith("#") or not line.strip():
                continue
            name, abbr, ref, mod = line.split('\t')
            # replace unknown modifications with reference of N
            if not ref or ref.isspace():
                ref = 'N'
            # entries without a modification code cannot be looked up; drop them
            if mod and not mod.isspace():
                modifications[mod.strip()] = {'name': name.strip(), 'abbr': abbr.strip(), 'ref': ref.strip()}

    return modifications
def getUnmodSeq(seq, modification_table):
    """Translate a modified-base sequence into plain DNA letters.

    Each symbol of ``seq`` is replaced by the 'ref' entry recorded for it in
    ``modification_table``; the result is joined and every 'U' is turned into
    'T'. A symbol missing from the table raises KeyError.
    """
    def _reference_base(symbol):
        # insertions ('_') are not described in the modifications table: use N
        if symbol == '_':
            return 'N'
        base = modification_table[symbol]['ref']
        # queuosine is listed with reference 'preQ0base' in the modification file; map it to G
        return 'G' if base == 'preQ0base' else base

    unmodified = ''.join(map(_reference_base, seq))
    return unmodified.replace('U', 'T')
def initIntronDict(tRNAscan_out):
    """Build a dictionary of intron locations from a tRNAscan-SE output table.

    Parameters:
        tRNAscan_out: path to the whitespace-delimited tRNAscan-SE result
            file. Lines starting with "Sequence", "Name" or "-" are treated
            as header lines and ignored, as are blank lines.

    Returns:
        dict mapping '<column 0>.trna<column 1>' to
        ``{'intron_start': int, 'intron_stop': int}``, where both values are
        offsets relative to the tRNA start (column 2) suitable for slicing the
        intron out of the sequence. Only rows whose intron bounds (columns 6
        and 7) are both greater than zero are recorded.
    """
    Intron_dict = {}
    intron_count = 0

    # Context manager so the handle is released even if a row fails to parse.
    with open(tRNAscan_out, 'r') as tRNAscan:
        for line in tRNAscan:
            if line.startswith(("Sequence", "Name", "-")):
                continue
            fields = line.split()
            if not fields:
                # blank line (e.g. trailing newline): nothing to parse
                continue

            tRNA_ID = fields[0] + ".trna" + fields[1]
            tRNA_start = int(fields[2])
            intron_start = int(fields[6])
            intron_stop = int(fields[7])

            # intron boundaries of 0 mean there is no intron for this tRNA
            if intron_start <= 0 or intron_stop <= 0:
                continue

            intron_count += 1
            if tRNA_start > intron_start:  # tRNA is on reverse strand
                intron_start = tRNA_start - intron_start
                # +1 needed for python 0 indexing and correct slicing of intron
                intron_stop = tRNA_start - intron_stop + 1
            else:  # tRNA is on forward strand
                intron_start -= tRNA_start
                intron_stop = intron_stop - tRNA_start + 1  # python 0 indexing

            Intron_dict[tRNA_ID] = {'intron_start': intron_start, 'intron_stop': intron_stop}

    log.info("{} introns registered...".format(intron_count))
    return Intron_dict
def intronRemover(Intron_dict, seqIO_dict, seqIO_record, posttrans_mod_off, double_cca):
    """Return the mature sequence of one tRNA record as a plain string.

    Parameters:
        Intron_dict: mapping of tRNAscan-SE IDs to
            ``{'intron_start': ..., 'intron_stop': ...}`` slice offsets, as
            produced by initIntronDict.
        seqIO_dict: mapping of record names to records exposing
            ``.description`` and ``.seq``.
        seqIO_record: key of the record to process in ``seqIO_dict``.
        posttrans_mod_off: when False, the 3' CCA tail and, for records whose
            key contains 'His', the 5' G are added.
        double_cca: append 'CCACCA' instead of 'CCA'.

    The tRNAscan-SE ID is taken from the record description, either from
    "tRNAscan-SE ID: <id>)" or from "(<chr...>-". If neither form is present,
    ``re.search`` returns None and an AttributeError is raised.
    """
    record = seqIO_dict[seqIO_record]

    # Raw string: '\)' and '\(' are invalid escapes in a normal string literal.
    # The compiled pattern is unchanged.
    ID = re.search(r"tRNAscan-SE ID: (.*?)\).|\((chr.*?)-", record.description).groups()
    # exactly one of the two alternatives captured something; take that one
    ID = list(filter(None, ID))[0]

    if ID in Intron_dict:
        # splice the intron out of the sequence
        bounds = Intron_dict[ID]
        seq = str(record.seq[:bounds['intron_start']] + record.seq[bounds['intron_stop']:])
    else:
        seq = str(record.seq)

    # Equality test kept on purpose so that only False/0 enables the additions.
    if posttrans_mod_off == False:
        seq = seq + ('CCACCA' if double_cca else 'CCA')
        if 'His' in seqIO_record:
            seq = 'G' + seq

    return seq
| def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, plastidtRNAs, experiment_name, out_dir, double_cca, threads, snp_tolerance = False, cluster = False, cluster_id = 0.95, posttrans_mod_off = False, pretrnas = False, local_mod = False, search='usearch'): | ||
| # Builds SNP index needed for GSNAP based on modification data for each tRNA and clusters tRNAs | ||
@@ -256,3 +377,3 @@ | ||
| # generate modomics_dict and tRNA_dict | ||
| tRNA_dict, modomics_dict, species = tRNAparser(gtRNAdb, tRNAscan_out, mitotRNAs, plastidtRNAs, modifications_table, posttrans_mod_off, double_cca, pretrnas, local_mod) | ||
| tRNA_dict, modomics_dict, species = tRNAparser(gtRNAdb, tRNAscan_out, mitotRNAs, plastidtRNAs, posttrans_mod_off, double_cca, pretrnas, local_mod) | ||
| temp_dir = out_dir + "/tmp/" | ||
@@ -910,89 +1031,2 @@ | ||
| def modificationParser(modifications_table): | ||
| # Read in modifications and build dictionary | ||
| mods = open(modifications_table, 'r', encoding='utf-8') | ||
| modifications = {} | ||
| for line in mods: | ||
| if not line.startswith("#"): | ||
| name, abbr, ref, mod = line.split('\t') | ||
| # replace unknown modifications with reference of N | ||
| if not ref or ref.isspace(): | ||
| ref = 'N' | ||
| if mod and not mod.isspace(): | ||
| modifications[mod.strip()] = {'name':name.strip(), 'abbr':abbr.strip(), 'ref':ref.strip()} | ||
| return(modifications) | ||
| def getUnmodSeq(seq, modification_table): | ||
| # Change modified bases into standard ACGT in input sequence | ||
| new_seq = [] | ||
| for char in seq: | ||
| # for insertions ('_') make reference N - this is not described in the modifications table | ||
| if char == '_': | ||
| char = 'N' | ||
| else: | ||
| char = modification_table[char]['ref'] | ||
| # Change queuosine to G (reference is preQ0base in modification file) | ||
| if char == 'preQ0base': | ||
| char = 'G' | ||
| new_seq.append(char) | ||
| new_seq = ''.join(new_seq) | ||
| new_seq = new_seq.replace('U','T') | ||
| return(new_seq) | ||
| def initIntronDict(tRNAscan_out): | ||
| # Build dictionary of intron locations | ||
| Intron_dict = {} | ||
| tRNAscan = open(tRNAscan_out, 'r') | ||
| intron_count = 0 | ||
| for line in tRNAscan: | ||
| if not line.startswith(("Sequence", "Name", "-")): | ||
| tRNA_ID = line.split()[0] + ".trna" + line.split()[1] | ||
| tRNA_start = int(line.split()[2]) | ||
| intron_start = int(line.split()[6]) | ||
| intron_stop = int(line.split()[7]) | ||
| # if inton boundaries are not 0, i.e. there is an intron then add to dict | ||
| if (intron_start > 0) & (intron_stop > 0): | ||
| if tRNA_start > intron_start: # tRNA is on reverse strand | ||
| intron_count += 1 | ||
| intron_start = tRNA_start - intron_start | ||
| intron_stop = tRNA_start - intron_stop + 1 # needed for python 0 indexing and correct slicing of intron | ||
| else: # tRNA is on forward strand | ||
| intron_count += 1 | ||
| intron_start -= tRNA_start | ||
| intron_stop -= tRNA_start | ||
| intron_stop += 1 # python 0 indexing | ||
| Intron_dict[tRNA_ID] = {} | ||
| Intron_dict[tRNA_ID]['intron_start'] = intron_start | ||
| Intron_dict[tRNA_ID]['intron_stop'] = intron_stop | ||
| log.info("{} introns registered...".format(intron_count)) | ||
| return(Intron_dict) | ||
| def intronRemover (Intron_dict, seqIO_dict, seqIO_record, posttrans_mod_off, double_cca): | ||
| # Use Intron_dict to find and remove introns plus add CCA and 5' G for His (if eukaryotic) | ||
| # Find a match, slice intron and add G and CCA | ||
| ID = re.search("tRNAscan-SE ID: (.*?)\).|\((chr.*?)-",seqIO_dict[seqIO_record].description).groups() | ||
| ID = list(filter(None, ID))[0] | ||
| if ID in Intron_dict: | ||
| seq = str(seqIO_dict[seqIO_record].seq[:Intron_dict[ID]['intron_start']] + seqIO_dict[seqIO_record].seq[Intron_dict[ID]['intron_stop']:]) | ||
| else: | ||
| seq = str(seqIO_dict[seqIO_record].seq) | ||
| if posttrans_mod_off == False: | ||
| if double_cca: | ||
| seq = seq + 'CCACCA' | ||
| else: | ||
| seq = seq + 'CCA' | ||
| if 'His' in seqIO_record: | ||
| seq = 'G' + seq | ||
| return(seq) | ||
| def countsAnticodon(input_counts, out_dir): | ||
@@ -999,0 +1033,0 @@ # Counts per anticodon |
@@ -1,1 +0,1 @@ | ||
| __version__ = "v1.3.7" | ||
| __version__ = "v1.3.8" |
+1
-1
| Metadata-Version: 2.1 | ||
| Name: mimseq | ||
| Version: 1.3.7 | ||
| Version: 1.3.8 | ||
| Summary: Custom high-throughput tRNA sequencing alignment and quantification pipeline based on modification induced misincorporation cDNA synthesis. | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/nedialkova-lab/mim-tRNAseq |
+2
-4
@@ -39,8 +39,6 @@ <p align="center"> | ||
| To use mim-tRNAseq, it is recommended to install the package using `conda`, preferably in its own environment. Significant time and dependency-related improvements can be made to using conda for managing environment and installing mimseq using the [Mambaforge](https://github.com/conda-forge/miniforge) version of conda Miniforge. We recommend installing Mambaforge and then followin the steps below: | ||
| To use mim-tRNAseq, it is recommended to install the package using `conda`, preferably in its own environment. Environment solving and installation of mimseq can be made significantly faster and less prone to dependency problems by using the [Miniforge](https://github.com/conda-forge/miniforge) distribution of conda, which includes optional use of Mamba. We recommend installing Miniforge and then following the steps below: | ||
| ```bash | ||
| conda create -n mimseq python=3.7 | ||
| conda activate mimseq | ||
| conda config --add channels conda-forge | ||
| conda install -c conda-forge mamba | ||
| mamba install -c bioconda mimseq | ||
@@ -62,3 +60,3 @@ ``` | ||
| Alternatively, mim-tRNAseq can be installed with `pip`, in which case all additional non-python package dependencies (see documentation) will also need to be installed. | ||
| Alternatively, mim-tRNAseq can be installed with `pip`, in which case all additional non-python package dependencies (including `usearch` as above, `BLAST`, `infernal`, `GMAP/GSNAP`, and all required R packages) will also need to be installed manually. | ||
| ```bash | ||
@@ -65,0 +63,0 @@ pip install mimseq |
+1
-0
@@ -51,2 +51,3 @@ #!/usr/bin/env python | ||
| "pybedtools", | ||
| "requests", | ||
| "statsmodels"], | ||
@@ -53,0 +54,0 @@ classifiers=[ |
Sorry, the diff of this file is too big to display
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
3610
0.78%17723397
-0.62%174
-0.57%