addext
Advanced tools
| #!/usr/bin/env python3 | ||
| """ | ||
| Save selected information from PRONOM exports | ||
| to JSON file, using PUID as key | ||
| PRONOM exports available from Ross Spencer: | ||
| https://github.com/exponential-decay/ | ||
| pronom-archive-and-skeleton-test-suite | ||
| Positional args: | ||
| pronom_export: Path to directory containing PRONOM XML exports | ||
| json_path: Path for new JSON file | ||
| """ | ||
| import json | ||
| from lxml import etree, objectify | ||
| import os | ||
| import sys | ||
def _element_text(parent, tag):
    """
    Return text of first child element named tag, or None
    if the element is missing or has no text
    """
    child = parent.find(tag)
    if child is None:
        return None
    return child.text


def main():
    """
    Convert a directory of PRONOM XML exports to one JSON file.

    Reads two positional arguments from sys.argv: the directory to walk
    for XML exports and the path of the JSON file to write. The JSON
    object is keyed by PUID; each value holds "file_format", "version"
    and "file_extensions". Formats without a PUID are left out.
    """
    if len(sys.argv) < 3:
        sys.exit('Usage: {} pronom_export json_path'.format(sys.argv[0]))

    # Save abspaths for args
    pronom_export = os.path.abspath(sys.argv[1])
    json_path = os.path.abspath(sys.argv[2])

    # Create dict to store data
    puids = dict()

    # Walk pronom_exports and parse XML files
    for root_dir, _, files in os.walk(pronom_export):
        for file_ in files:
            # Skip file if not XML
            if not file_.lower().endswith('.xml'):
                continue
            file_path = os.path.join(root_dir, file_)

            # Open XML file and strip namespaces
            root = etree.parse(file_path).getroot()
            for elem in root.iter():
                # Skip nodes whose tag is not a string
                if not hasattr(elem.tag, 'find'):
                    continue
                i = elem.tag.find('}')
                if i >= 0:
                    elem.tag = elem.tag[i + 1:]
            objectify.deannotate(root, cleanup_namespaces=True)

            # Parse XML
            for target in root.findall('.//FileFormat'):
                # Fresh containers per format so entries never share state
                puid = ''
                file_extensions = list()

                # Save PUID to variable
                for identifier in target.findall('.//FileFormatIdentifier'):
                    if _element_text(identifier, 'IdentifierType') == 'PUID':
                        puid = _element_text(identifier, 'Identifier') or ''

                # Without a PUID there is no key to store the format under
                if not puid:
                    continue

                # Save file extensions to list, ignoring empty signatures
                for signature in target.findall('.//ExternalSignature'):
                    if _element_text(signature, 'SignatureType') != 'File extension':
                        continue
                    extension = _element_text(signature, 'Signature')
                    if extension:
                        file_extensions.append(extension)

                # Add to dict with PUID as key
                puids[puid] = {
                    'file_format': _element_text(target, 'FormatName'),
                    # An empty FormatVersion element has no text at all
                    'version': (_element_text(target, 'FormatVersion') or '').strip(),
                    'file_extensions': file_extensions,
                }

    # Write dict to file as JSON
    with open(json_path, 'w') as f:
        json.dump(puids, f, indent=2)
| if __name__ == '__main__': | ||
| main() |
+95
| ## addext | ||
| ### Version: 2.0.0 | ||
| [Build status on Travis CI](https://travis-ci.org/timothyryanwalsh/addext) | ||
| Python script to add file extensions to files without them, based on Siegfried identification of PUID. | ||
| ### Calling addext | ||
| `addext.py` takes two positional arguments: | ||
| * `target`: Path to target file or directory | ||
| * `json`: Path to addext PRONOM JSON file (`pronom_v95.json` is included in this repository for convenience. See **PRONOM JSON file** section below for instructions on how to create a new JSON file in expected format from PRONOM XML exports) | ||
| Options include: | ||
| * `-d, --dryrun`: Perform dry run (print would-be changes to terminal instead of renaming files) | ||
| * `-m, --manual`: Manually choose extension to add to files when PRONOM gives several options (not available in Windows) | ||
| ### Behavior | ||
| #### Default mode | ||
| In its default mode, `addext` adds file extensions to files if they meet a few conditions: | ||
| * Siegfried can positively identify a PUID for the file | ||
| * There is at least one file extension associated with the PUID in PRONOM | ||
| * The file does not already have one of the extensions listed in PRONOM for that PUID (case-insensitive) | ||
| If all conditions are met, `addext` adds the file extension to the file in-place. It is recommended that you try a dry run first to evaluate the proposed changes before renaming files. | ||
| #### Manual mode | ||
| In `-m, --manual` mode, `addext` applies the following logic: | ||
| * If Siegfried cannot positively identify a PUID for the file, skip the file | ||
| * If there is only one file extension associated with the PUID in PRONOM and the file does not already have this extension (case-insensitive), add the extension | ||
| * If there is more than one file extension associated with the PUID in PRONOM and the file does not already have one of these extensions (case-insensitive), allow the user to choose which extension to add and then modify the filename in-place | ||
| Note that for directories with many files, going through the files one-by-one in manual mode may take some time. Running `addext` as a dry run in manual mode may help give an idea of the extent of manual choices you will be asked to make. | ||
| Due to its dependency on [Inquirer](https://github.com/magmax/python-inquirer), manual mode is not available on Windows. | ||
| ### Requirements | ||
| * Python 3.6+ | ||
| * [Siegfried](https://github.com/richardlehane/siegfried) | ||
| * [Inquirer](https://github.com/magmax/python-inquirer): For selection between extension options in `-m, --manual` mode (Linux/macOS only); installed with `pip install inquirer` | ||
| ### Installation | ||
| #### Install Siegfried | ||
| Install Siegfried following the instructions found [here](https://github.com/richardlehane/siegfried). | ||
| #### Install via git clone/download | ||
| The easiest way to use `addext` is to clone or download this repository and then run the script with `python3 /path/to/addext.py [options]`. | ||
| If taking this route, install additional Python library dependencies: `pip install -r requirements.txt` or `pip install inquirer` (this may require sudo permissions). | ||
| #### Install via PyPI | ||
| `addext` can also be installed via `pip install addext`. This will install a script in the `/usr/local/bin` directory (assuming a Linux/macOS installation) so that `addext` can be called from anywhere with simply `addext.py [options]`. | ||
| Note that following installation, you will need to download or create a PRONOM JSON file to use with `addext`. | ||
| ### PRONOM JSON file | ||
| #### Description | ||
| The PRONOM JSON file is a lightweight representation of information from PRONOM needed for addext to function. The file contains an object for each format described with a PRONOM ID (PUID), structured like the following example: | ||
| ``` | ||
| "fmt/858": { | ||
| "file_format": "Navisworks Document", | ||
| "version": "2010", | ||
| "file_extensions": [ | ||
| "nwd", | ||
| "nwc" | ||
| ] | ||
| } | ||
| ``` | ||
| #### Updating the PRONOM JSON file | ||
| `pronom_v95.json` is currently up-to-date with PRONOM release v95. | ||
| To create a new PRONOM JSON file (for instance, after a new PRONOM release): | ||
| * Get PRONOM XML export from Ross Spencer's [Release repository for The Skeleton Test Suite](https://github.com/exponential-decay/pronom-archive-and-skeleton-test-suite), which provides a set of DOIs for archives of PRONOM releases. | ||
| * Run `addext/pronom_xml_to_json.py` to create a new PRONOM JSON file from the XML exports: `python3 pronom_xml_to_json.py /path/to/pronom/export/directory pronom.json` | ||
| ### Creators | ||
| * Canadian Centre for Architecture | ||
| * Tim Walsh | ||
| This project was initially developed in 2016-2017 for the [Canadian Centre for Architecture](https://www.cca.qc.ca) by Tim Walsh, Digital Archivist, as part of the development of the Archaeology of the Digital project. |
| Metadata-Version: 1.1 | ||
| Name: addext | ||
| Version: 1.0.1 | ||
| Summary: Adds file extensions to files based on their PRONOM identifiers (PUIDs). | ||
| Version: 2.0.0 | ||
| Summary: Adds file extensions based on PRONOM ID | ||
| Home-page: https://github.com/timothyryanwalsh/addext | ||
@@ -22,6 +22,6 @@ Author: Tim Walsh | ||
| Classifier: Operating System :: Microsoft :: Windows | ||
| Classifier: Programming Language :: Python :: 2.7 | ||
| Classifier: Programming Language :: Python :: 3.5 | ||
| Classifier: Programming Language :: Python :: 3.6 | ||
| Classifier: Programming Language :: Python :: 3.7 | ||
| Classifier: Programming Language :: Python :: 3.8 | ||
| Classifier: Topic :: System :: Filesystems | ||
| Classifier: Topic :: Utilities |
@@ -1,7 +0,6 @@ | ||
| MANIFEST.in | ||
| README.md | ||
| setup.py | ||
| addext/__init__.py | ||
| addext/addext.py | ||
| addext/pronom-xml-to-sqlite.py | ||
| addext/pronom.db | ||
| addext/pronom_xml_to_json.py | ||
| addext.egg-info/PKG-INFO | ||
@@ -8,0 +7,0 @@ addext.egg-info/SOURCES.txt |
+185
-189
@@ -1,223 +0,219 @@ | ||
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| #!/usr/bin/env python3 | ||
| """ | ||
| Adds file extensions to files based on their PUIDs. | ||
| addext | ||
| ------ | ||
| CLI utility to add file extensions to files without them based on PRONOM ID | ||
| Tim Walsh | ||
| November 2017 | ||
| Script has three modes: | ||
| * Default: Adds first file extension associated with PUID in PRONOM | ||
| * Dry run: Preview changes from Default mode without making any changes | ||
| to the files | ||
| * Manual: Manually choose extension to add to files when PRONOM gives several | ||
| options (Linux/macOS only) | ||
| Requires Siegfried and inquirer. See README for installation instructions | ||
| """ | ||
| import argparse | ||
| import csv | ||
| import inquirer | ||
| import logging | ||
| import json | ||
| import os | ||
| import shutil | ||
| import sqlite3 | ||
| import subprocess | ||
| import sys | ||
| import tempfile | ||
| try: | ||
| # python3 | ||
| from urllib.request import urlopen | ||
| except ImportError: | ||
| # fall back to python 2's urllib2 | ||
| from urllib2 import urlopen | ||
| def _make_parser(): | ||
| parser = argparse.ArgumentParser() | ||
| parser.add_argument("-d", "--dryrun", | ||
| help="Perform dry run: print would-be changes to terminal", | ||
| action="store_true") | ||
| parser.add_argument("-m", "--manual", | ||
| help="Manually choose extension to add to files when PRONOM gives several options (not available in Windows)", | ||
| action="store_true") | ||
| parser.add_argument("--droid_csv", | ||
| help="Path to DROID CSV (created by DROID or Siegfried) for files", | ||
| action="store") | ||
| parser.add_argument("file", | ||
| help="Path to file or files where extensions will be added") | ||
| parser.add_argument( | ||
| "-d", | ||
| "--dryrun", | ||
| help="Perform dry run: print would-be changes to terminal", | ||
| action="store_true", | ||
| ) | ||
| parser.add_argument( | ||
| "-m", | ||
| "--manual", | ||
| help="Manually choose extension when multiple options (Linux/macOS)", | ||
| action="store_true", | ||
| ) | ||
| parser.add_argument("target", help="Path to target file or directory") | ||
| parser.add_argument("json", help="Path to PRONOM JSON file") | ||
| return parser | ||
| def download_pronom_db(): | ||
| def _configure_logging(): | ||
| """ | ||
| Download pronom.db from Github to script directory. | ||
| Configure logging to write to logfile created in | ||
| user's current directory and to stdout | ||
| """ | ||
| print("Addext could not find pronom.db file in script directory.") | ||
| print("Downloading file now. This should only be necessary once.") | ||
| logging.basicConfig( | ||
| level=logging.INFO, | ||
| format="%(asctime)s - %(levelname)s - %(message)s", | ||
| handlers=[ | ||
| logging.FileHandler("addext.log"), | ||
| logging.StreamHandler(sys.stdout) | ||
| ], | ||
| ) | ||
| logger = logging.getLogger() | ||
| return logger | ||
| # url for pronom.db | ||
| url = "https://github.com/timothyryanwalsh/addext/blob/master/addext/pronom.db?raw=true" | ||
| # download file to current directory | ||
| file_name = "pronom.db" | ||
| u = urlopen(url) | ||
| f = open(file_name, 'wb') | ||
| block_sz = 8192 | ||
| while True: | ||
| buffer = u.read(block_sz) | ||
| if not buffer: | ||
| break | ||
| f.write(buffer) | ||
| f.close() | ||
| def _puid_or_none(sf_matches): | ||
| """ | ||
| From input list of dictionaries describing Siegfried | ||
| matches for given file, return PUID or None | ||
| """ | ||
| puid = None | ||
| for match in sf_matches: | ||
| if match["ns"] == "pronom": | ||
| puid = match["id"] | ||
| return puid | ||
| # check that file was successfully downloaded | ||
| if os.path.isfile(file_name) and os.path.getsize(file_name) > 0: | ||
| print("File successfully downloaded.") | ||
| def _check_file_extension(filepath, extensions): | ||
| """ | ||
| Return True if file extension (case-insensitive) | ||
| is present in list, and False if not | ||
| """ | ||
| # Get lower-cased file extension from path | ||
| _, file_extension = os.path.splitext(filepath) | ||
| file_extension_lower = file_extension[1:].lower() | ||
| # Make lower-cased list | ||
| extensions_lower = list() | ||
| for item in extensions: | ||
| extensions_lower.append(item.lower()) | ||
| # Check equivalency | ||
| if file_extension_lower in extensions_lower: | ||
| return True | ||
| else: | ||
| print("Error downloading database. Check permissions in script directory.") | ||
| sys.exit(69) | ||
| return False | ||
| def _rename_file(filepath, new_file, new_filepath, logger): | ||
| """ | ||
| Rename file in place and log OSErrors | ||
| """ | ||
| try: | ||
| os.rename(filepath, new_filepath) | ||
| logger.info(f"{filepath} renamed -> {new_file}") | ||
| except OSError as e: | ||
| logger.error(f"Unable to rename {filepath}. Details: {e}") | ||
def _process_file(root, filepath, pronom_data, args, logger):
    """
    Identify and rename file, respecting user args

    Args:
        root: Directory in which the renamed file is created
        filepath: Path of the file to identify with Siegfried
        pronom_data: Dict keyed by PUID; each value provides
            "file_format" and "file_extensions"
        args: Parsed arguments; the manual and dryrun attributes are read
        logger: Logger used for all reporting

    Exits the program with status 1 if Siegfried cannot be run.
    """
    file_ = os.path.basename(filepath)

    # Attempt to determine PUID with Siegfried
    cmd = ["sf", "-json", filepath]
    try:
        sf_json = subprocess.check_output(cmd)
    except OSError:
        # Raised when the sf executable cannot be started at all
        logger.error("Unable to call Siegfried. Is it installed and on path?")
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        logger.error(f"Siegfried returned an error for {filepath}. Details: {e}")
        sys.exit(1)
    sf_data = json.loads(sf_json)
    puid = _puid_or_none(sf_data["files"][0]["matches"])

    # Return if unidentified ("UNKNOWN" is Siegfried's no-match placeholder)
    if not puid or puid == "UNKNOWN":
        logger.info(f"Skipping {filepath} - format not identifiable")
        return

    # Return if the PRONOM JSON file has no entry for this PUID
    format_info = pronom_data.get(puid)
    if format_info is None:
        logger.info(f"Skipping {filepath} - {puid} not found in PRONOM JSON file")
        return

    # Save file format
    file_format = format_info["file_format"]

    # Return if already has one of extensions listed in PRONOM
    extensions = format_info["file_extensions"]
    if _check_file_extension(filepath, extensions):
        logger.info(
            f"Skipping {filepath} - already has correct extension for {file_format} ({puid})"
        )
        return

    # Return if no extensions listed for format in PRONOM
    if not extensions:
        logger.info(
            f"Skipping {filepath} - no extensions listed in PRONOM for {file_format} ({puid})"
        )
        return

    # If manual mode and > 1 extension available, prompt for user input
    if args.manual and len(extensions) > 1:
        # Log all known extensions
        extensions_str = ", ".join(extensions)
        logger.info(
            f"{filepath} identified as {file_format} ({puid}). Possible extensions: {extensions_str}"
        )

        # If --dryrun, return
        if args.dryrun:
            return

        # Use Inquirer to let user choose from list
        questions = [
            inquirer.List(
                "extension",
                message="Which extension would you like to add?",
                choices=extensions,
            )
        ]
        answers = inquirer.prompt(questions)

        # No answers come back if the prompt is cancelled; leave file as-is
        if not answers:
            logger.info(f"Skipping {filepath} - no extension chosen")
            return

        # Rename file with chosen extension
        new_file = f"{file_}.{answers['extension']}"
        _rename_file(filepath, new_file, os.path.join(root, new_file), logger)
        return

    # If default (auto) mode or only 1 extension, use first extension
    new_file = f"{file_}.{extensions[0]}"
    new_filepath = os.path.join(root, new_file)

    # If --dryrun, log change to make and return
    if args.dryrun:
        logger.info(
            f"{filepath} identified as {file_format} ({puid}). Rename {file_} -> {new_file}"
        )
        return

    # Otherwise, rename file in place
    _rename_file(filepath, new_file, new_filepath, logger)
| def main(): | ||
| # parse arguments | ||
| # Parse arguments | ||
| parser = _make_parser() | ||
| args = parser.parse_args() | ||
| source = os.path.abspath(args.file) | ||
| # Store fs references as abspaths | ||
| target = os.path.abspath(args.target) | ||
| pronom_json = os.path.abspath(args.json) | ||
| # connect to pronom.db | ||
| THIS_DIR = os.path.dirname(os.path.realpath(__file__)) | ||
| db = os.path.join(THIS_DIR, 'pronom.db') | ||
| # download copy of pronom.db if not in same directory as script | ||
| if not os.path.isfile(db): | ||
| download_pronom_db() | ||
| try: | ||
| conn = sqlite3.connect(db) | ||
| conn.text_factory = str # allows utf-8 data to be stored | ||
| cursor = conn.cursor() | ||
| except: | ||
| print("Error connecting to pronom.db database. Shutting down.") | ||
| sys.exit(69) | ||
| # Configure logging | ||
| logger = _configure_logging() | ||
| # create DROID CSV if user didn't pass one to script | ||
| if args.droid_csv: | ||
| droid_csv = os.path.abspath(args.droid_csv) | ||
| else: | ||
| # create tempdir for droid csv | ||
| tmpdir = tempfile.mkdtemp() | ||
| tmpdir_path = os.path.abspath(tmpdir) | ||
| droid_csv = os.path.join(tmpdir_path, 'droid.csv') | ||
| # create droid csv with siegfried | ||
| subprocess.call("sf -droid '%s' > '%s'" % (source, droid_csv), shell=True) | ||
| # Load PRONOM JSON as dictionary | ||
| with open(pronom_json, "r") as f: | ||
| pronom_data = json.load(f) | ||
| # loop through files | ||
| for rt, dirs, files in os.walk(source): | ||
| for f in files: | ||
| filepath = os.path.join(rt, f) | ||
| puid = '' | ||
| # search DROID CSV for path, get PUID | ||
| with open(droid_csv) as droid: | ||
| r = csv.reader(droid) | ||
| for row in r: | ||
| if row[3] == filepath: | ||
| puid = row[14] | ||
| fileformat = row[16] | ||
| # if PUID found, carry on | ||
| if puid != '': | ||
| # if manual, give option to user whenever > 1 possible extension is found | ||
| if args.manual: | ||
| # get list of possible extensions using puid | ||
| sql = "SELECT id from puids WHERE puid='%s';" % (puid) | ||
| cursor.execute(sql) | ||
| pk = cursor.fetchone()[0] | ||
| sql = "SELECT extension from extensions WHERE puid='%s';" % (pk) | ||
| cursor.execute(sql) | ||
| file_ext_list = [item[0] for item in cursor.fetchall()] | ||
| # if >= 1 extension found, carry on | ||
| if file_ext_list: | ||
| # check if dry run - if so, print results to terminal | ||
| if args.dryrun == True: | ||
| print("File %s is format %s (%s). Possible extensions: %s" % (filepath, fileformat, puid, ', '.join(map(str, file_ext_list)))) | ||
| else: | ||
| # if only one possible extension, just add it and report to user | ||
| if len(file_ext_list) == 1: | ||
| # append filename to file in-place | ||
| file_ext = "." + file_ext_list[0] | ||
| new_filepath = filepath + file_ext | ||
| new_filename = f + file_ext | ||
| # check if file already ends in correct extension before adding | ||
| if not filepath.lower().endswith(file_ext): | ||
| try: | ||
| os.rename(filepath, new_filepath) | ||
| print("File " + filepath + " only has one possible extension. Renamed to " + new_filename) | ||
| except OSError as err: | ||
| print("Error renaming file " + filepath + ": ", err) | ||
| else: | ||
| print("File " + filepath + " already has correct extension. Skipping file.") | ||
| # if > 1 extension, give control to user | ||
| else: | ||
| # get user input | ||
| if (sys.version_info > (3, 0)): | ||
| choice = input("File %s is format %s (%s). Possible extensions: %s. Add an extension? (y/n)" % (filepath, fileformat, puid, ', '.join(map(str, file_ext_list)))) | ||
| else: | ||
| choice = raw_input("File %s is format %s (%s). Possible extensions: %s. Add an extension? (y/n)" % (filepath, fileformat, puid, ', '.join(map(str, file_ext_list)))) | ||
| # if input is yes, display options and apply change | ||
| if choice.lower() in ['yes', 'y']: | ||
| # use Inquirer to let user choose from list | ||
| questions = [ | ||
| inquirer.List('extension', | ||
| message="Which extension would you like to add?", | ||
| choices=file_ext_list, | ||
| ), | ||
| ] | ||
| # get chosen extension | ||
| answers = inquirer.prompt(questions) | ||
| file_ext = "." + answers['extension'] | ||
| # append filename to file in-place | ||
| new_filepath = filepath + file_ext | ||
| new_filename = f + file_ext | ||
| try: | ||
| os.rename(filepath, new_filepath) | ||
| print("File " + filepath + " renamed to " + new_filename) | ||
| except OSError as err: | ||
| print("Error renaming file " + filepath + ": ", err) | ||
| else: | ||
| print("File " + filepath + " skipped.") | ||
| # Check if target is file | ||
| if os.path.isfile(target): | ||
| root = os.path.split(target)[0] | ||
| _process_file(root, target, pronom_data, args, logger) | ||
| return | ||
| else: | ||
| print("File " + filepath + " identified as " + puid + ". No extensions are registered in PRONOM for this PUID. Skipping file.") | ||
| # else, use default extension (first listed in PRONOM for PUID) | ||
| else: | ||
| sql = "SELECT default_extension from puids WHERE puid='%s';" % (puid) | ||
| cursor.execute(sql) | ||
| file_ext = cursor.fetchone()[0] | ||
| if file_ext: | ||
| new_filepath = filepath + "." + file_ext # filename + extension | ||
| new_filename = f + "." + file_ext # new filename without path | ||
| # check if dry run - if so, print results to stdout | ||
| if args.dryrun == True: | ||
| if not filepath.lower().endswith(file_ext): | ||
| print("File %s is format %s (%s). Rename %s -> %s" % (filepath, fileformat, puid, f, new_filename)) | ||
| else: | ||
| print("File " + filepath + " already has correct extension. Skipping file.") | ||
| else: | ||
| # check if file already ends in correct extension before adding | ||
| if not filepath.lower().endswith(file_ext): | ||
| try: | ||
| os.rename(filepath, new_filepath) | ||
| print("File " + filepath + " renamed to " + new_filename) | ||
| except OSError as err: | ||
| print("Error renaming file " + filepath + ": ", err) | ||
| else: | ||
| print("File " + filepath + " already has correct extension. Skipping file.") | ||
| else: | ||
| print("File " + filepath + " identified as " + puid + ". No extensions are registered in PRONOM for this PUID. Skipping file.") | ||
| else: | ||
| print("File " + filepath + " not identified. Skipping file.") | ||
| # If target is dir, walk recursively | ||
| for root, _, files in os.walk(target): | ||
| for file_ in files: | ||
| filepath = os.path.join(root, file_) | ||
| _process_file(root, filepath, pronom_data, args, logger) | ||
| # delete DROID tempdir if applicable | ||
| if not args.droid_csv: | ||
| shutil.rmtree(tmpdir_path) | ||
| # close db, print finished message | ||
| conn.commit() | ||
| conn.close() | ||
| print("Process complete.") | ||
| if __name__ == '__main__': | ||
| main() | ||
| if __name__ == "__main__": | ||
| main() |
+4
-4
| Metadata-Version: 1.1 | ||
| Name: addext | ||
| Version: 1.0.1 | ||
| Summary: Adds file extensions to files based on their PRONOM identifiers (PUIDs). | ||
| Version: 2.0.0 | ||
| Summary: Adds file extensions based on PRONOM ID | ||
| Home-page: https://github.com/timothyryanwalsh/addext | ||
@@ -22,6 +22,6 @@ Author: Tim Walsh | ||
| Classifier: Operating System :: Microsoft :: Windows | ||
| Classifier: Programming Language :: Python :: 2.7 | ||
| Classifier: Programming Language :: Python :: 3.5 | ||
| Classifier: Programming Language :: Python :: 3.6 | ||
| Classifier: Programming Language :: Python :: 3.7 | ||
| Classifier: Programming Language :: Python :: 3.8 | ||
| Classifier: Topic :: System :: Filesystems | ||
| Classifier: Topic :: Utilities |
+0
-1
| [egg_info] | ||
| tag_build = | ||
| tag_date = 0 | ||
| tag_svn_revision = 0 | ||
+16
-20
| from setuptools import setup | ||
| setup( | ||
| name = 'addext', | ||
| version = '1.0.1', | ||
| url = 'https://github.com/timothyryanwalsh/addext', | ||
| author = 'Tim Walsh', | ||
| author_email = 'timothyryanwalsh@gmail.com', | ||
| name='addext', | ||
| version='2.0.0', | ||
| url='https://github.com/timothyryanwalsh/addext', | ||
| author='Tim Walsh', | ||
| author_email='timothyryanwalsh@gmail.com', | ||
| packages=['addext'], | ||
| package_data={ | ||
| 'addext': ['pronom.db'] | ||
| }, | ||
| include_package_data=True, | ||
| py_modules = ['addext'], | ||
| scripts = ['addext/addext.py'], | ||
| install_requires = ['inquirer'], | ||
| description = 'Adds file extensions to files based on their PRONOM identifiers (PUIDs).', | ||
| keywords = 'extensions identification', | ||
| platforms = ['POSIX', 'Windows'], | ||
| classifiers = [ | ||
| py_modules=['addext'], | ||
| scripts=['addext/addext.py'], | ||
| install_requires=['inquirer'], | ||
| description='Adds file extensions based on PRONOM ID', | ||
| keywords='extensions identification', | ||
| platforms=['POSIX', 'Windows'], | ||
| classifiers=[ | ||
| 'Development Status :: 4 - Beta', | ||
@@ -25,3 +21,3 @@ 'License :: OSI Approved :: MIT License', | ||
| 'Intended Audience :: Developers', | ||
| 'Natural Language :: English', | ||
| 'Natural Language :: English', | ||
| 'Operating System :: MacOS', | ||
@@ -31,8 +27,8 @@ 'Operating System :: MacOS :: MacOS X', | ||
| 'Operating System :: Microsoft :: Windows', | ||
| 'Programming Language :: Python :: 2.7', | ||
| 'Programming Language :: Python :: 3.5', | ||
| 'Programming Language :: Python :: 3.6', | ||
| 'Programming Language :: Python :: 3.7', | ||
| 'Programming Language :: Python :: 3.8', | ||
| 'Topic :: System :: Filesystems', | ||
| 'Topic :: Utilities' | ||
| ], | ||
| ) | ||
| ) |
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| """ | ||
| Crawls XML output from Ross Spencer's pronom-xml-export | ||
| (https://github.com/exponential-decay/pronom-xml-export) | ||
| and writes selected info into a sqlite db. | ||
| Tim Walsh | ||
| November 2017 | ||
| """ | ||
| import argparse | ||
| import os | ||
| from lxml import etree, objectify | ||
| import sqlite3 | ||
| def _make_parser(): | ||
| parser = argparse.ArgumentParser() | ||
| parser.add_argument("source", | ||
| help="Path of PRONOM XML export directory") | ||
| parser.add_argument("destination", | ||
| help="Path of directory to write sqlite db") | ||
| return parser | ||
def _child_text(parent, tag):
    """
    Return text of first child element named tag, or None
    if the element is missing or has no text
    """
    child = parent.find(tag)
    if child is None:
        return None
    return child.text


def main():
    """
    Write selected information from PRONOM XML exports to pronom.db.

    Walks the source directory for XML files and (re)creates the puids and
    extensions tables in pronom.db inside the destination directory. One
    puids row is written per FileFormat element, plus one extensions row
    per file extension. Nothing is committed if an error interrupts the run.
    """
    # parse arguments
    parser = _make_parser()
    args = parser.parse_args()

    # make abspaths for source and dest dirs
    source = os.path.abspath(args.source)
    dest = os.path.abspath(args.destination)

    # create sqlite db
    db = os.path.join(dest, 'pronom.db')
    conn = sqlite3.connect(db)
    try:
        conn.text_factory = str  # allows utf-8 data to be stored
        cursor = conn.cursor()

        # create db tables
        cursor.execute("DROP TABLE IF EXISTS puids")
        cursor.execute("DROP TABLE IF EXISTS extensions")
        cursor.execute("CREATE TABLE puids (id integer PRIMARY KEY AUTOINCREMENT, puid text, fileformat text, version text, default_extension text);")
        cursor.execute("CREATE TABLE extensions (id integer PRIMARY KEY AUTOINCREMENT, extension text, puid text, FOREIGN KEY (puid) REFERENCES puids(id));")

        # for PUID XML file in export, write info to db
        for rt, _, files in os.walk(source):
            for name in files:
                file_path = os.path.join(rt, name)

                # skip if not xml
                if not file_path.lower().endswith('xml'):
                    continue

                # open xml file and strip namespaces
                root = etree.parse(file_path).getroot()
                for elem in root.iter():
                    # skip nodes whose tag is not a string
                    if not hasattr(elem.tag, 'find'):
                        continue
                    i = elem.tag.find('}')
                    if i >= 0:
                        elem.tag = elem.tag[i+1:]
                objectify.deannotate(root, cleanup_namespaces=True)

                # parse xml
                for target in root.findall(".//FileFormat"):
                    format_name = _child_text(target, "FormatName")
                    # an empty FormatVersion element has no text at all
                    format_version = (_child_text(target, "FormatVersion") or '').strip()

                    # always only one PUID; empty string if none is listed
                    pronom_ids = [
                        _child_text(identifier, "Identifier")
                        for identifier in target.findall(".//FileFormatIdentifier")
                        if _child_text(identifier, "IdentifierType") == "PUID"
                    ]
                    pronom_id = pronom_ids[0] if pronom_ids else ''

                    # 0 to many extensions - keep all and save first separately
                    file_exts = [
                        _child_text(signature, "Signature")
                        for signature in target.findall(".//ExternalSignature")
                        if _child_text(signature, "SignatureType") == "File extension"
                    ]
                    file_exts = [ext for ext in file_exts if ext]
                    default_ext = file_exts[0] if file_exts else ''

                    # write into db puid table
                    cursor.execute("INSERT INTO puids(puid, fileformat, version, default_extension) VALUES (?,?,?,?);", (pronom_id, format_name, format_version, default_ext))
                    puid_pk = cursor.lastrowid  # get pk of row written

                    # write extensions into extensions table
                    for ext in file_exts:
                        cursor.execute("INSERT INTO extensions(extension, puid) VALUES (?,?);", (ext, puid_pk))

        conn.commit()
    finally:
        # close db even when parsing or writing fails part-way
        conn.close()
| if __name__ == '__main__': | ||
| main() |
Sorry, the diff of this file is too big to display
| include addext/pronom.db |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
17377
-88.05%12
-7.69%284
-15.48%