text-categorizer
Advanced tools
Comparing version 1.0.3 to 1.0.4
@@ -14,2 +14,4 @@ export declare const PATTERNS: { | ||
FILE_PATH: RegExp; | ||
MEASUREMENT: RegExp; | ||
SQL: RegExp; | ||
}; |
@@ -8,3 +8,3 @@ export const PATTERNS = { | ||
DATE: /\b(\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}|\d{1,2}\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,4})\b/gi, | ||
EQUATION: /[\d\s]*[+\-*/()=]+[\d\s]*/, | ||
EQUATION: /(?:\d+\s*|\(\s*)[+\-*/()=]+\s*(?:\d+\s*|\)) /, | ||
PHONE: /(?:\+\d{1,3}\s?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g, | ||
@@ -14,3 +14,5 @@ ADDRESS: /\d+\s+([A-Za-z]+(\.?\s|\.)){1,}\s*,?\s*([A-Za-z]+\s*,\s*)?[A-Z]{2}\s*\d{5}(-\d{4})?/g, | ||
PRODUCT_CODE: /^[A-Z0-9]{3,}-[A-Z0-9]{3,}$|^[A-Z]{2,4}\d{4,}$/, | ||
FILE_PATH: /^(?:[a-zA-Z]:\\|\/|\.\/|\.\.\/)(?:[^\\\/:*?"<>|\r\n]+\\)*[^\\\/:*?"<>|\r\n]*$/ | ||
FILE_PATH: /^(?:[a-zA-Z]:\\|\/|\.\/|\.\.\/)(?:[^\\\/:*?"<>|\r\n]+\\)*[^\\\/:*?"<>|\r\n]*$/, | ||
MEASUREMENT: /(\d+(\.\d+)?\s*(cm|mm|in|ft|yd|mi|km|g|kg|lb|oz|ml|l|tsp|tbsp|cup|pt|qt|gal|fl oz|in²|ft²|yd²|mi²|km²|ac|ha|sq mi|sq km|°F|°C|K|°|'|"))/gi, | ||
SQL: /\b(SELECT|INSERT|UPDATE|DELETE|FROM|WHERE|JOIN|GROUP BY|ORDER BY|HAVING|CREATE|ALTER|DROP|TABLE|INDEX)\b/i | ||
}; |
@@ -15,3 +15,3 @@ import type { ContentCategory } from "./types"; | ||
private static extractSocialElements; | ||
private static isSearchQuery; | ||
private static isSearchParams; | ||
private static extractDates; | ||
@@ -26,2 +26,3 @@ private static isPhoneNumber; | ||
private static isProductCode; | ||
private static isMeasurement; | ||
private static parseCurrency; | ||
@@ -28,0 +29,0 @@ private static parseCsv; |
@@ -7,3 +7,3 @@ import { PATTERNS } from "./constants"; | ||
static containsLinks(text) { | ||
return text.match(PATTERNS.URL) || []; | ||
return text.match(PATTERNS.URL) || undefined; | ||
} | ||
@@ -91,9 +91,5 @@ static isList(text) { | ||
} | ||
static isSearchQuery(text) { | ||
const searchPatterns = [ | ||
/^(what|how|who|where|when|why)\s.+\??$/i, | ||
/^["'].+["']\s*(site:|filetype:|OR|AND)/i, | ||
/^[^.!?]+\??$/ | ||
]; | ||
return searchPatterns.some((pattern) => pattern.test(text.trim())); | ||
static isSearchParams(text) { | ||
const searchPattern = /(\?|\&)?[a-zA-Z0-9_]+=[^&]*/; | ||
return searchPattern.test(text.trim()); | ||
} | ||
@@ -129,4 +125,3 @@ static extractDates(text) { | ||
static isSql(text) { | ||
const sqlKeywords = /\b(SELECT|INSERT|UPDATE|DELETE|FROM|WHERE|JOIN|GROUP BY|ORDER BY|HAVING|CREATE|ALTER|DROP|TABLE|INDEX)\b/i; | ||
return sqlKeywords.test(text) && text.includes(";"); | ||
return PATTERNS.SQL.test(text) && text.includes(";"); | ||
} | ||
@@ -142,2 +137,5 @@ static isFilePath(text) { | ||
} | ||
static isMeasurement(text) { | ||
return PATTERNS.MEASUREMENT.test(text); | ||
} | ||
static parseCurrency(text) { | ||
@@ -249,9 +247,2 @@ const match = text.match(PATTERNS.CURRENCY); | ||
} | ||
if (this.isSearchQuery(content)) { | ||
return { | ||
type: "query", | ||
content, | ||
metadata: { confidence: 0.8 } | ||
}; | ||
} | ||
const dates = this.extractDates(content); | ||
@@ -289,2 +280,26 @@ if (dates.length > 0) { | ||
} | ||
if (this.isEquation(content)) { | ||
return { | ||
type: "equation", | ||
content, | ||
metadata: { format: "math" } | ||
}; | ||
} | ||
if (this.isSearchParams(content)) { | ||
return { | ||
type: "search", | ||
content, | ||
metadata: { links: this.containsLinks(content) } | ||
}; | ||
} | ||
if (this.isMeasurement(content)) { | ||
return { | ||
type: "measurement", | ||
content, | ||
metadata: { | ||
amount: parseFloat(content.match(PATTERNS.MEASUREMENT)?.[0] || ""), | ||
unit: content.match(/[a-zA-Z]+/)?.[0] || "" | ||
} | ||
}; | ||
} | ||
return { type: "text", content }; | ||
@@ -291,0 +306,0 @@ } |
export type ContentCategory = { | ||
type: "link" | "code" | "list" | "text" | "email" | "json" | "markdown" | "equation" | "date" | "social" | "query" | "phone" | "address" | "csv" | "xml" | "sql" | "filepath" | "currency" | "productCode"; | ||
type: "link" | "code" | "list" | "text" | "email" | "json" | "markdown" | "equation" | "date" | "social" | "search" | "phone" | "address" | "csv" | "xml" | "sql" | "filepath" | "currency" | "productCode" | "measurement"; | ||
content: string; | ||
@@ -18,3 +18,4 @@ metadata?: { | ||
structured?: any; | ||
unit?: string; | ||
}; | ||
}; |
{ | ||
"name": "text-categorizer", | ||
"version": "1.0.3", | ||
"version": "1.0.4", | ||
"description": "Intelligent text content type detection and classification", | ||
@@ -45,6 +45,9 @@ "main": "dist/index.js", | ||
"devDependencies": { | ||
"@types/jest": "^29.5.14", | ||
"@types/node": "^18.0.0", | ||
"typescript": "^4.9.0", | ||
"eslint": "^8.0.0" | ||
"eslint": "^8.0.0", | ||
"jest": "^29.7.0", | ||
"ts-jest": "^29.2.5", | ||
"typescript": "^4.9.0" | ||
} | ||
} |
@@ -183,3 +183,8 @@ # Text Categorizer | ||
- Product codes | ||
- Measurement | ||
## TODO | ||
- Measurements | ||
## Contributing | ||
@@ -186,0 +191,0 @@ |
@@ -8,3 +8,3 @@ export const PATTERNS = { | ||
DATE: /\b(\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}|\d{1,2}\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,4})\b/gi, | ||
EQUATION: /[\d\s]*[+\-*/()=]+[\d\s]*/, | ||
EQUATION: /(?:\d+\s*|\(\s*)[+\-*/()=]+\s*(?:\d+\s*|\)) /, ///[\d\s]*[+\-*/()=]+[\d\s]*/, | ||
PHONE: /(?:\+\d{1,3}\s?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g, | ||
@@ -16,3 +16,6 @@ ADDRESS: | ||
FILE_PATH: | ||
/^(?:[a-zA-Z]:\\|\/|\.\/|\.\.\/)(?:[^\\\/:*?"<>|\r\n]+\\)*[^\\\/:*?"<>|\r\n]*$/ | ||
/^(?:[a-zA-Z]:\\|\/|\.\/|\.\.\/)(?:[^\\\/:*?"<>|\r\n]+\\)*[^\\\/:*?"<>|\r\n]*$/, | ||
MEASUREMENT: | ||
/(\d+(\.\d+)?\s*(cm|mm|in|ft|yd|mi|km|g|kg|lb|oz|ml|l|tsp|tbsp|cup|pt|qt|gal|fl oz|in²|ft²|yd²|mi²|km²|ac|ha|sq mi|sq km|°F|°C|K|°|'|"))/gi, | ||
SQL: /\b(SELECT|INSERT|UPDATE|DELETE|FROM|WHERE|JOIN|GROUP BY|ORDER BY|HAVING|CREATE|ALTER|DROP|TABLE|INDEX)\b/i | ||
}; |
@@ -11,4 +11,4 @@ import { PATTERNS } from "./constants"; | ||
private static containsLinks(text: string): string[] { | ||
return text.match(PATTERNS.URL) || []; | ||
private static containsLinks(text: string): string[] | undefined { | ||
return text.match(PATTERNS.URL) || undefined; | ||
} | ||
@@ -116,10 +116,6 @@ | ||
private static isSearchQuery(text: string): boolean { | ||
const searchPatterns = [ | ||
/^(what|how|who|where|when|why)\s.+\??$/i, | ||
/^["'].+["']\s*(site:|filetype:|OR|AND)/i, | ||
/^[^.!?]+\??$/ | ||
]; | ||
private static isSearchParams(text: string): boolean { | ||
const searchPattern = /(\?|\&)?[a-zA-Z0-9_]+=[^&]*/; | ||
return searchPatterns.some((pattern) => pattern.test(text.trim())); | ||
return searchPattern.test(text.trim()); | ||
} | ||
@@ -167,5 +163,3 @@ | ||
private static isSql(text: string): boolean { | ||
const sqlKeywords = | ||
/\b(SELECT|INSERT|UPDATE|DELETE|FROM|WHERE|JOIN|GROUP BY|ORDER BY|HAVING|CREATE|ALTER|DROP|TABLE|INDEX)\b/i; | ||
return sqlKeywords.test(text) && text.includes(";"); | ||
return PATTERNS.SQL.test(text) && text.includes(";"); | ||
} | ||
@@ -185,2 +179,6 @@ | ||
/**
 * Reports whether `text` contains a number followed by a measurement unit,
 * as described by `PATTERNS.MEASUREMENT`.
 *
 * @param text - The string to inspect.
 * @returns `true` when the pattern matches anywhere in `text`.
 */
private static isMeasurement(text: string): boolean {
  const pattern = PATTERNS.MEASUREMENT;
  // PATTERNS.MEASUREMENT is declared with the `g` flag, so `.test()` starts
  // scanning at `lastIndex` and advances it on success. Because the regex is
  // a shared constant, a leftover `lastIndex` would make the answer depend
  // on earlier uses; always scan from the start of the string.
  pattern.lastIndex = 0;
  const found = pattern.test(text);
  // Leave the shared regex in a clean state for its next user.
  pattern.lastIndex = 0;
  return found;
}
private static parseCurrency(text: string): { | ||
@@ -313,10 +311,2 @@ amount: number; | ||
if (this.isSearchQuery(content)) { | ||
return { | ||
type: "query", | ||
content, | ||
metadata: { confidence: 0.8 } | ||
}; | ||
} | ||
const dates = this.extractDates(content); | ||
@@ -358,2 +348,29 @@ if (dates.length > 0) { | ||
if (this.isEquation(content)) { | ||
return { | ||
type: "equation", | ||
content, | ||
metadata: { format: "math" } | ||
}; | ||
} | ||
if (this.isSearchParams(content)) { | ||
return { | ||
type: "search", | ||
content, | ||
metadata: { links: this.containsLinks(content) } | ||
}; | ||
} | ||
if (this.isMeasurement(content)) { | ||
return { | ||
type: "measurement", | ||
content, | ||
metadata: { | ||
amount: parseFloat(content.match(PATTERNS.MEASUREMENT)?.[0] || ""), | ||
unit: content.match(/[a-zA-Z]+/)?.[0] || "" | ||
} | ||
}; | ||
} | ||
return { type: "text", content }; | ||
@@ -360,0 +377,0 @@ } |
@@ -13,3 +13,3 @@ export type ContentCategory = { | ||
| "social" | ||
| "query" | ||
| "search" | ||
| "phone" | ||
@@ -22,3 +22,4 @@ | "address" | ||
| "currency" | ||
| "productCode"; | ||
| "productCode" | ||
| "measurement"; | ||
content: string; | ||
@@ -39,3 +40,4 @@ metadata?: { | ||
structured?: any; | ||
unit?: string; | ||
}; | ||
}; |
41974
21
985
239
6