type-analyzer
Comparing version 0.2.1 to 0.2.2
@@ -19,2 +19,8 @@ | ||
#### [0.2.2] - Oct 07 2019 (b913061) | ||
37de75d add eslint and prettier | ||
0f10ab9 [Doc] Update README to add ignoredDataTypes | ||
3e0134e remove TYPE city all together | ||
2cf641b port internal type-analyzer commits over (currently at 1.5.0) | ||
#### [0.2.1] - Jan 03 2019 (c6fb704) | ||
@@ -21,0 +27,0 @@ 4c2a767 Add CHANGELOG.md |
{ | ||
"name": "type-analyzer", | ||
"description": "Infer types from columns in JSON", | ||
"version": "0.2.1", | ||
"version": "0.2.2", | ||
"main": "index.js", | ||
@@ -11,3 +11,3 @@ "repository": { | ||
"scripts": { | ||
"lint": "eslint src test", | ||
"lint": "eslint src test --fix", | ||
"test": "tape test" | ||
@@ -18,11 +18,9 @@ }, | ||
"eslint": "^4.4.1", | ||
"eslint-config-uber-es5": "^2.0.3", | ||
"eslint-config-prettier": "^4.3.0", | ||
"eslint-config-uber-es2015": "^3.1.2", | ||
"eslint-plugin-prettier": "^3.1.0", | ||
"eslint-plugin-es5": "^1.4.1", | ||
"tape": "^4.6.3", | ||
"tape-run": "^2.1.4" | ||
}, | ||
"eslintConfig": { | ||
"extends": [ | ||
"eslint-config-uber-es5" | ||
] | ||
}, | ||
"contributors": [ | ||
@@ -36,2 +34,6 @@ { | ||
"email": "chua@uber.com" | ||
}, | ||
{ | ||
"name": "Shan He", | ||
"email": "shan@uber.com" | ||
} | ||
@@ -38,0 +40,0 @@ ], |
# type-analyzer | ||
> Infer types from CSV columns. | ||
Infer data types from CSV columns. | ||
@@ -10,16 +10,17 @@ ## Overview | ||
* Geo-JSON, | ||
* WKT Geometry, | ||
* Boolean, | ||
* Currency, | ||
* Percent, | ||
* DateTime, | ||
* Date, | ||
* Time, | ||
* Int, | ||
* Float, | ||
* Number, | ||
* Zipcode, | ||
* City, | ||
* String | ||
* **DATE** | ||
* **TIME** | ||
* **DATETIME** | ||
* **NUMBER** | ||
* **INT** | ||
* **FLOAT** | ||
* **CURRENCY** | ||
* **PERCENT** | ||
* **STRING** | ||
* **ZIPCODE** | ||
* **BOOLEAN** | ||
* **GEOMETRY** | ||
* **GEOMETRY_FROM_STRING** | ||
* **PAIR_GEOMETRY_FROM_STRING** | ||
* **NONE** | ||
@@ -32,13 +33,36 @@ ## Installation | ||
Usage is super simple: just call `computeColMeta` on your data like so | ||
### `Analyzer.computeColMeta(data, rules, options)` (Function) | ||
**Parameters** | ||
- `data` **Array** _required_ An array of row objects | ||
- `rules` **Array** _optional_ An array of custom rules | ||
- `options` **Object** _optional_ Options object | ||
- `options.ignoredDataTypes` **Array** _optional_ Data types to ignore | ||
```js | ||
var Analyzer = require('type-analyzer').Analyzer; | ||
var data = [ | ||
{ | ||
"ST_AsText": "MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))", | ||
"name": "san_francisco", | ||
"lat": "37.7749295", | ||
"lng": "-122.4194155", | ||
"launch_date": "2010-06-05", | ||
"added_at": "2010-06-05 12:00" | ||
}, | ||
{ | ||
"ST_AsText": "MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))", | ||
"name": "paris", | ||
"lat": "48.856666", | ||
"lng": "2.3509871", | ||
"launch_date": "2011-12-04", | ||
"added_at": "2010-06-05 12:00" | ||
}, | ||
] | ||
var colMeta = Analyzer.computeColMeta(data); | ||
``` | ||
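To get a quick sense of what comes back, here is a minimal sketch that just logs each column's inferred type. Each entry in `colMeta` carries at least a `label`, `type`, `category` and `format`; the values mentioned in the comment are what you would expect for the sample data above, not verified output.

```js
// Illustrative only: print the inferred type for every column.
colMeta.forEach(function logColumn(col) {
  console.log(col.label + ' -> ' + col.type);
});
// For the sample data above you would expect lines such as
// "lat -> FLOAT" and "launch_date -> DATE".
```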
- **`rules`** | ||
But imagine you want to ensure that a column full of ids represented as numbers | ||
is identified as a column of strings, type-analyzer's got you. Simply pass an | ||
array of rules: | ||
You can pass in an array of custom rules, for example if you want to ensure that a column full of ids represented as numbers is identified as a column of strings. Rules are matched either by the exact `name` of a column or by a `regex` applied to column names. | ||
@@ -51,4 +75,12 @@ ```js | ||
var colMeta = Analyzer.computeColMeta(data, [{regex: /id/, dataType: 'STRING'}]); | ||
``` | ||
Note: the Analyzer prefers rules that use `name` over `regex`, since they perform better. | ||
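For a rule keyed on an exact column name rather than a regex, the shape is the same; a minimal sketch, assuming your data has a column literally named `id`:

```js
var colMeta = Analyzer.computeColMeta(data, [{name: 'id', dataType: 'STRING'}]);
```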
- **`options.ignoredDataTypes`** | ||
You can also pass in `ignoredDataTypes` to skip certain types during validation. This will improve type-checking performance. | ||
```js | ||
var DATA_TYPES = require('type-analyzer').DATA_TYPES; | ||
var colType = Analyzer.computeColMeta(data, [], {ignoredDataTypes: [DATA_TYPES.CURRENCY]})[0].type; | ||
``` | ||
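Since `ignoredDataTypes` is an array, you can also pass several types at once, e.g. `{ignoredDataTypes: [DATA_TYPES.CURRENCY, DATA_TYPES.PERCENT]}`.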
@@ -59,3 +91,7 @@ | ||
### `DATA_TYPES` | ||
You can import all available types as a constant. | ||
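A minimal sketch of pulling the constant in; the values are plain strings that mirror the key names (e.g. `DATA_TYPES.STRING === 'STRING'`):

```js
var DATA_TYPES = require('type-analyzer').DATA_TYPES;

// Lists every supported type name, e.g. 'DATE', 'FLOAT', 'GEOMETRY', ...
console.log(Object.keys(DATA_TYPES));
```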
## Update | ||
@@ -62,0 +98,0 @@ Breaking changes with v1.0.0: Regex has moved into src, but can more easily be |
@@ -20,3 +20,2 @@ // Copyright (c) 2017 Uber Technologies, Inc. | ||
// THE SOFTWARE. | ||
'use strict'; | ||
@@ -43,9 +42,13 @@ | ||
/** | ||
* Check if a given value is a null for a validator | ||
* @param {String} value - value to be checked if null | ||
* @param {String} validatorName - the name of the current validation function | ||
* @return {Boolean} whether or not the current value is null | ||
**/ | ||
* Check if a given value is a null for a validator | ||
* @param {String} value - value to be checked if null | ||
* @param {String} validatorName - the name of the current validation function | ||
* @return {Boolean} whether or not the current value is null | ||
**/ | ||
function valueIsNullForValidator(value, validatorName) { | ||
if (value === null || value === CONSTANT.NULL || typeof value === 'undefined') { | ||
if ( | ||
value === null || | ||
value === CONSTANT.NULL || | ||
typeof value === 'undefined' | ||
) { | ||
return true; | ||
@@ -68,11 +71,14 @@ } | ||
}); | ||
var validator = VALIDATOR_MAP[validatorName]; | ||
var strikes = Math.min(NUMBER_OF_ALLOWED_HITS, nonNullData.length); | ||
var hits = 0; | ||
nonNullData.some(function iterateAcrossData(row) { | ||
var value = row[columnName]; | ||
if (Boolean(VALIDATOR_MAP[validatorName](value)) === false) { | ||
strikes -= 1; | ||
var isValueValid = Boolean(validator(row[columnName])); | ||
if (isValueValid) { | ||
hits++; | ||
} else { | ||
hits += 1; | ||
strikes--; | ||
} | ||
if (strikes <= 0) { | ||
@@ -83,2 +89,3 @@ return true; | ||
}); | ||
return strikes > 0 && hits > 0; | ||
@@ -104,11 +111,25 @@ }; | ||
/** | ||
* Generate metadata about columns in a dataset | ||
* @param {Object} data - data for which meta will be generated | ||
* @param {Object} analyzerRules - regexs describing column overrides | ||
* @return {Object} column metadata | ||
**/ | ||
Analyzer.computeColMeta = function computeColMeta(data, analyzerRules) { | ||
* Generate metadata about columns in a dataset | ||
* @param {Object} data - data for which meta will be generated | ||
* @param {Object} analyzerRules - regexs describing column overrides | ||
* @param {Object.array} ignoredDataTypes - array of datatypes to ignore when validating | ||
* @return {Object} column metadata | ||
**/ | ||
Analyzer.computeColMeta = function computeColMeta( | ||
data, | ||
analyzerRules, | ||
options | ||
) { | ||
var ignoredDataTypes = (options || {}).ignoredDataTypes || []; | ||
var allValidators = CONSTANT.VALIDATORS.filter(function filterValidators( | ||
validator | ||
) { | ||
return this.indexOf(validator) < 0; | ||
}, | ||
ignoredDataTypes); | ||
if (!data || Object.keys(data).length === 0) { | ||
return []; | ||
} | ||
var _columns = Object.keys(data[0]); | ||
@@ -120,5 +141,5 @@ /* eslint-disable max-statements */ | ||
var type = getTypeFromRules(analyzerRules, columnName); | ||
// If it's not there then try to infer the type | ||
// if it's not there then try to infer the type | ||
if (!type) { | ||
type = CONSTANT.VALIDATORS.find(buildValidatorFinder(data, columnName)); | ||
type = allValidators.find(buildValidatorFinder(data, columnName)); | ||
} | ||
@@ -143,5 +164,5 @@ // if theres still no type, dump this column | ||
label: columnName, | ||
type: type, | ||
category: category, | ||
format: format | ||
type, | ||
category, | ||
format | ||
}; | ||
@@ -154,3 +175,6 @@ | ||
} | ||
colMeta.geoType = typeof geoSample.type === 'string' ? geoSample.type.toUpperCase() : null; | ||
colMeta.geoType = | ||
typeof geoSample.type === 'string' | ||
? geoSample.type.toUpperCase() | ||
: null; | ||
} | ||
@@ -157,0 +181,0 @@ if (type === CONSTANT.DATA_TYPES.GEOMETRY_FROM_STRING) { |
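The new `ignoredDataTypes` handling above removes ignored validators before inference by passing the array as the `thisArg` of `Array.prototype.filter`. A minimal standalone sketch of that pattern, with hypothetical values:

```js
// Hypothetical values, purely to illustrate the filter/thisArg pattern
// used by computeColMeta above.
var VALIDATORS = ['DATE', 'FLOAT', 'CURRENCY', 'STRING'];
var ignoredDataTypes = ['CURRENCY'];

var allValidators = VALIDATORS.filter(function filterValidators(validator) {
  // `this` is the second argument given to filter, i.e. ignoredDataTypes
  return this.indexOf(validator) < 0;
}, ignoredDataTypes);
// allValidators -> ['DATE', 'FLOAT', 'STRING']
```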
@@ -20,3 +20,2 @@ // Copyright (c) 2017 Uber Technologies, Inc. | ||
// THE SOFTWARE. | ||
'use strict'; | ||
@@ -40,3 +39,2 @@ | ||
STRING: 'STRING', | ||
CITY: 'CITY', | ||
ZIPCODE: 'ZIPCODE', | ||
@@ -85,3 +83,2 @@ | ||
CONSTANT.DATA_TYPES.BOOLEAN, | ||
CONSTANT.DATA_TYPES.CITY, | ||
CONSTANT.DATA_TYPES.ZIPCODE | ||
@@ -98,4 +95,4 @@ ]; | ||
CONSTANT.TYPES_TO_CATEGORIES = Object.keys(CONSTANT.POSSIBLE_TYPES) | ||
.reduce(function generateTypeToCategoryMap(res, category) { | ||
CONSTANT.TYPES_TO_CATEGORIES = Object.keys(CONSTANT.POSSIBLE_TYPES).reduce( | ||
function generateTypeToCategoryMap(res, category) { | ||
CONSTANT.POSSIBLE_TYPES[category].forEach(function loopAcrossTypes(type) { | ||
@@ -105,3 +102,5 @@ res[type] = category; | ||
return res; | ||
}, {}); | ||
}, | ||
{} | ||
); | ||
@@ -138,3 +137,2 @@ // NOTE: the order of validator is important. | ||
CONSTANT.DATA_TYPES.ZIPCODE, | ||
CONSTANT.DATA_TYPES.CITY, | ||
CONSTANT.DATA_TYPES.STRING | ||
@@ -141,0 +139,0 @@ ]; |
@@ -20,4 +20,4 @@ // Copyright (c) 2017 Uber Technologies, Inc. | ||
// THE SOFTWARE. | ||
'use strict'; | ||
'use strict'; | ||
var TimeRegex = require('./time-regex'); | ||
@@ -48,6 +48,2 @@ | ||
// maybe we should import a list of cities we have. | ||
// reference: http://stackoverflow.com/a/25677072 | ||
isCity: /^([a-zA-Z\u0080-\u024F]+(?:. |-| |'))*[a-zA-Z\u0080-\u024F]*$/, | ||
isTime: TimeRegex.ALL_TIME_FORMAT_REGEX, | ||
@@ -54,0 +50,0 @@ |
@@ -20,4 +20,4 @@ // Copyright (c) 2017 Uber Technologies, Inc. | ||
// THE SOFTWARE. | ||
'use strict'; | ||
'use strict'; | ||
/** | ||
@@ -63,2 +63,3 @@ * Given an array of regexes to union, build a string of them | ||
].reverse(); | ||
// the reverse is important to put the more specific regexes higher in the order | ||
@@ -80,7 +81,9 @@ var TIME_FORMAT_REGEX_STRINGS = [ | ||
// {'(\d{2)....': 'M-D-YYYY'} | ||
var TIME_FORMAT_REGEX_MAP = TIME_FORMAT_STRINGS | ||
.reduce(function generateRegexMap(timeFormats, str, index) { | ||
var TIME_FORMAT_REGEX_MAP = TIME_FORMAT_STRINGS.reduce( | ||
function generateRegexMap(timeFormats, str, index) { | ||
timeFormats[TIME_FORMAT_REGEX_STRINGS[index]] = str; | ||
return timeFormats; | ||
}, {}); | ||
}, | ||
{} | ||
); | ||
@@ -134,2 +137,3 @@ var ALL_TIME_FORMAT_REGEX_STR = union(Object.keys(TIME_FORMAT_REGEX_MAP)); | ||
]; | ||
var dateFormatStrings = [ | ||
@@ -148,7 +152,11 @@ 'YYYY-M-D', | ||
// {'(\d{2)....': 'M-D-YYYY'} | ||
var DATE_FORMAT_REGEX_MAP = dateFormatStrings | ||
.reduce(function generateRegexMap(dateFormats, str, index) { | ||
dateFormats[dateFormatRegexStrings[index]] = str; | ||
return dateFormats; | ||
}, {}); | ||
var DATE_FORMAT_REGEX_MAP = dateFormatStrings.reduce(function generateRegexMap( | ||
dateFormats, | ||
str, | ||
index | ||
) { | ||
dateFormats[dateFormatRegexStrings[index]] = str; | ||
return dateFormats; | ||
}, | ||
{}); | ||
@@ -158,6 +166,8 @@ // COMPUTE THEIR CROSS PRODUCT | ||
// {'SOME HELLISH REGEX': 'YYYY HH:MM:SS'} | ||
var DATE_TIME_MAP = Object.keys(DATE_FORMAT_REGEX_MAP) | ||
.reduce(function reduceDate(dateTimes, dateRegex) { | ||
var DATE_TIME_MAP = Object.keys(DATE_FORMAT_REGEX_MAP).reduce( | ||
function reduceDate(dateTimes, dateRegex) { | ||
var dateStr = DATE_FORMAT_REGEX_MAP[dateRegex]; | ||
Object.keys(TIME_FORMAT_REGEX_MAP).forEach(function loopAcrosTimes(timeRegex) { | ||
Object.keys(TIME_FORMAT_REGEX_MAP).forEach(function loopAcrosTimes( | ||
timeRegex | ||
) { | ||
var timeStr = TIME_FORMAT_REGEX_MAP[timeRegex]; | ||
@@ -170,3 +180,5 @@ dateTimes[dateRegex + ' ' + timeRegex] = dateStr + ' ' + timeStr; | ||
return dateTimes; | ||
}, {}); | ||
}, | ||
{} | ||
); | ||
var ALL_DATE_TIME_REGEX = new RegExp(union(Object.keys(DATE_TIME_MAP))); | ||
@@ -173,0 +185,0 @@ |
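For orientation, the date/time handling above crosses every date regex with every time regex to build a combined `DATE_TIME_MAP`. A minimal standalone sketch of that reduce-over-keys pattern, with hypothetical, simplified map contents:

```js
// Hypothetical, simplified maps: regex string -> human-readable format.
var DATE_FORMAT_REGEX_MAP = {'\\d{4}-\\d{2}-\\d{2}': 'YYYY-M-D'};
var TIME_FORMAT_REGEX_MAP = {'\\d{1,2}:\\d{2}': 'H:m'};

var DATE_TIME_MAP = Object.keys(DATE_FORMAT_REGEX_MAP).reduce(
  function reduceDate(dateTimes, dateRegex) {
    var dateStr = DATE_FORMAT_REGEX_MAP[dateRegex];
    Object.keys(TIME_FORMAT_REGEX_MAP).forEach(function loopAcrossTimes(timeRegex) {
      var timeStr = TIME_FORMAT_REGEX_MAP[timeRegex];
      // combined key matches "date time", combined value names the format
      dateTimes[dateRegex + ' ' + timeRegex] = dateStr + ' ' + timeStr;
    });
    return dateTimes;
  },
  {}
);
// DATE_TIME_MAP -> {'\\d{4}-\\d{2}-\\d{2} \\d{1,2}:\\d{2}': 'YYYY-M-D H:m'}
```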
@@ -67,5 +67,4 @@ // Copyright (c) 2017 Uber Technologies, Inc. | ||
VALIDATOR_MAP[DATA_TYPES.ZIPCODE] = Utils.buildRegexCheck('isZipCode'); | ||
VALIDATOR_MAP[DATA_TYPES.CITY] = Utils.buildRegexCheck('isCity'); | ||
VALIDATOR_MAP[DATA_TYPES.STRING] = Utils.isString; | ||
module.exports = VALIDATOR_MAP; |
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package