gis-metadata-parser
XML parsers for GIS metadata that are designed to read in, validate, update and output a core set of properties that have been mapped between the most common standards, currently:
- FGDC
- ISO-19139 (and ISO-19115)
- ArcGIS (tested with ArcGIS format 1.0).
This library is compatible with Python versions 2.7 and 3.4 through 3.6.
Installation
Install with `pip install gis-metadata-parser`.
Usage
Parsers can be instantiated from files, XML strings or URLs. They can be converted from one standard to another as well.
from gis_metadata.arcgis_metadata_parser import ArcGISParser
from gis_metadata.fgdc_metadata_parser import FgdcParser
from gis_metadata.iso_metadata_parser import IsoParser
from gis_metadata.metadata_parser import get_metadata_parser
with open(r'/path/to/metadata.xml') as metadata:
fgdc_from_file = FgdcParser(metadata)
with open(r'/path/to/metadata.xml') as metadata:
iso_from_file = IsoParser(metadata)
fgdc_from_string = get_metadata_parser(
"""
<?xml version='1.0' encoding='UTF-8'?>
<metadata>
<idinfo>
</idinfo>
</metadata>
"""
)
arcgis_from_string = get_metadata_parser(
"""
<?xml version='1.0' encoding='UTF-8'?>
<metadata>
<dataIdInfo></dataIdInfo>
<distInfo></distInfo>
<dqInfo></dqInfo>
</metadata>
"""
)
iso_from_string = get_metadata_parser(
"""
<?xml version='1.0' encoding='UTF-8'?>
<MD_Metadata>
<identificationInfo>
</identificationInfo>
</MD_Metadata>
"""
)
fgdc_converted = iso_from_file.convert_to(FgdcParser)
iso_converted = fgdc_from_file.convert_to(IsoParser)
arcgis_converted = iso_converted.convert_to(ArcGISParser)
fgdc_key_vals = fgdc_from_file.convert_to(dict)
iso_key_vals = iso_from_file.convert_to(dict)
Finally, the properties of the parser can be updated, validated, applied and output:
with open(r'/path/to/metadata.xml') as metadata:
fgdc_from_file = FgdcParser(metadata)
fgdc_from_file.title
fgdc_from_file.abstract
fgdc_from_file.place_keywords
fgdc_from_file.thematic_keywords
fgdc_from_file.attributes
fgdc_from_file.bounding_box
fgdc_from_file.contacts
fgdc_from_file.dates
fgdc_from_file.digital_forms
fgdc_from_file.larger_works
fgdc_from_file.process_steps
fgdc_from_file.raster_info
fgdc_from_file.title = 'New Title'
fgdc_from_file.dates = {'type': 'single', 'values': '1/1/2016'}
fgdc_from_file.validate()
fgdc_from_file.serialize()
fgdc_from_file.write()
fgdc_from_file.write(out_file_or_path='/path/to/updated.xml')
Extending and Customizing
Tips
There are a few unwritten (until now) rules about the way the metadata parsers are wired to work:
- Properties are generally defined by XPATH in each `parser._data_map`
- Simple parser properties accept only values of type `string` and `list`s of `string`s
- XPATHs configured in the data map support references to element attributes: `'path/to/element/@attr'`
- Complex parser properties are defined by custom parser/updater functions instead of by XPATH
- Complex parser properties accept values of type `dict` containing simple properties, or a `list` of said `dict`s
- XPATH keys in the data map with leading underscores are parsed, but not validated or written out
- XPATH keys in the data map that "shadow" other properties but with a leading underscore serve as secondary values
- Secondary values are used in the absence of a primary value if the primary location (element or attribute) is missing
- Additional underscores indicate further locations to check for missing values, e.g. `title`, `_title`, `__title`
Some examples of existing secondary properties are as follows:
ARCGIS_TAG_FORMATS = frozendict({
...
'dist_phone': 'distInfo/distributor/distorCont/rpCntInfo/cntPhone/voiceNum',
'_dist_phone': 'distInfo/distributor/distorCont/rpCntInfo/voiceNum',
...
})
FGDC_DEFINITIONS = dict({k: dict(v) for k, v in iteritems(COMPLEX_DEFINITIONS)})
FGDC_DEFINITIONS[CONTACTS].update({
'_name': '{_name}',
'_organization': '{_organization}'
})
...
class FgdcParser(MetadataParser):
...
def _init_data_map(self):
...
ct_format = FGDC_TAG_FORMATS[CONTACTS]
fgdc_data_structures[CONTACTS] = format_xpaths(
...
name=ct_format.format(ct_path='cntperp/cntper'),
_name=ct_format.format(ct_path='cntorgp/cntper'),
organization=ct_format.format(ct_path='cntperp/cntorg'),
_organization=ct_format.format(ct_path='cntorgp/cntorg'),
)
ISO_DEFINITIONS = dict({k: dict(v) for k, v in iteritems(COMPLEX_DEFINITIONS)})
ISO_DEFINITIONS[ATTRIBUTES].update({
'_definition_source': '{_definition_src}',
'__definition_source': '{__definition_src}',
'___definition_source': '{___definition_src}'
})
Examples
Any of the supported parsers can be extended to include more of a standard's supported data. In this example we'll add two new properties to the `IsoParser`:
- `metadata_language`: a simple string field describing the language of the metadata file itself (not the dataset)
- `metadata_contacts`: a complex structure with contact info leveraging and enhancing the existing contact structure
This example will cover:
- Adding a new simple property
- Configuring a secondary location for a property value
- Referencing an element attribute in an XPATH
- Adding a new complex property
- Customizing the complex property to include a new sub-property
Also, this example is specifically covered by unit tests.
from gis_metadata.iso_metadata_parser import IsoParser
from gis_metadata.utils import COMPLEX_DEFINITIONS, CONTACTS, format_xpaths, ParserProperty
class CustomIsoParser(IsoParser):
def _init_data_map(self):
super(CustomIsoParser, self)._init_data_map()
lang_prop = 'metadata_language'
self._data_map[lang_prop] = 'language/CharacterString'
self._data_map['_' + lang_prop] = 'language/LanguageCode/@codeListValue'
ct_prop = 'metadata_contacts'
ct_xpath = 'contact/CI_ResponsibleParty/{ct_path}'
ct_definition = COMPLEX_DEFINITIONS[CONTACTS]
ct_definition['phone'] = '{phone}'
self._data_structures[ct_prop] = format_xpaths(
ct_definition,
name=ct_xpath.format(ct_path='individualName/CharacterString'),
organization=ct_xpath.format(ct_path='organisationName/CharacterString'),
position=ct_xpath.format(ct_path='positionName/CharacterString'),
phone=ct_xpath.format(
ct_path='contactInfo/CI_Contact/phone/CI_Telephone/voice/CharacterString'
),
email=ct_xpath.format(
ct_path='contactInfo/CI_Contact/address/CI_Address/electronicMailAddress/CharacterString'
)
)
self._data_map['_{prop}_root'.format(prop=ct_prop)] = 'contact'
self._data_map[ct_prop] = ParserProperty(self._parse_complex_list, self._update_complex_list)
self._metadata_props.add(lang_prop)
self._metadata_props.add(ct_prop)
with open(r'/path/to/metadata.xml') as metadata:
iso_from_file = CustomIsoParser(metadata)
iso_from_file.metadata_language
iso_from_file.metadata_contacts