Comparing version 0.1.1 to 0.2.0
@@ -172,4 +172,11 @@ import { Har } from "har-format"; | ||
* @param request The request to process in our internal request format. | ||
* @param options An optional object that can configure the following options: | ||
* | ||
* - `indicatorValues`: An object that specifies known honey data values for certain properties. If no adapter could match | ||
* the request but indicator values are provided, this function will fall back to indicator matching and try to | ||
* find the indicator values in the request headers, path or body. See {@link IndicatorValues}. | ||
*/ | ||
export const processRequest: (request: Request) => AnnotatedResult | undefined; | ||
export const processRequest: (request: Request, options?: { | ||
indicatorValues?: IndicatorValues; | ||
}) => AnnotatedResult | undefined; | ||
/** | ||
@@ -179,3 +186,4 @@ * Extended version of the {@link Result} type that includes additional metadata about the detected tracking. Each entry | ||
* | ||
* - `adapter`: The adapter that detected the tracking data (`<tracker slug>/<adapter slug>`). | ||
* - `adapter`: The adapter that detected the tracking data (`<tracker slug>/<adapter slug>`) or `indicators` if the entry | ||
* was detected through indicator matching. | ||
* - `property`: The type of tracking data that was detected. | ||
@@ -187,18 +195,51 @@ * - `value`: The actual value of the tracking data that was transmitted. | ||
* This is unavoidable as the trackers don't transmit in a standardized format. | ||
* | ||
* If indicator matching was used to detect this entry, the path will point to the first character of the match in the | ||
* respective part of the request. | ||
* - `reasoning`: An explanation of how we concluded that this information is actually the type of data we labelled it | ||
* as. This can either be a standardized description, or a URL to a more in-depth research report. | ||
* | ||
* If indicator matching was used to detect this entry, the reasoning will be `indicator matching` followed by the | ||
* encoding that was used to match the indicator value in parentheses. | ||
*/ | ||
export type AnnotatedResult = ({ | ||
adapter: string; | ||
property: Property; | ||
property: LiteralUnion<Property, string>; | ||
value: TrackingDataValue; | ||
} & DataPath)[]; | ||
reasoning: DataPath['reasoning'] | 'indicator matching (plain text)' | 'indicator matching (base64)' | 'indicator matching (URL-encoded)'; | ||
} & Omit<DataPath, 'reasoning'>)[]; | ||
/** | ||
* A mapping from properties (standardized names for certain types of tracking data) to the actual instances of values | ||
* of that property found in a request. | ||
* | ||
* If indicator matching is enabled, it is not possible to distinguish between instances detected through adapter and | ||
* indicator matching. | ||
*/ | ||
export type Result = Partial<Record<Property, TrackingDataValue[]>>; | ||
export type Result = Partial<Record<LiteralUnion<Property, string>, TrackingDataValue[]>>; | ||
/** | ||
* A mapping from properties (standardized names for certain types of tracking data) to indicator values (known honey | ||
* data strings that appear in the request if the property is present). Indicator values can be provided as arrays or | ||
* single strings. They are automatically matched against their encoded versions (e.g. base64 and URL-encoded). Where | ||
* possible, they are matched case-insensitively. | ||
* | ||
* @example | ||
* | ||
* ```ts | ||
* { | ||
* "localIp": ["10.0.0.2", "fd31:4159::a2a1"], | ||
* "idfa": "6a1c1487-a0af-4223-b142-a0f4621d0311" | ||
* } | ||
* ``` | ||
* | ||
* This example means that if the string `10.0.0.2` or `fd31:4159::a2a1` is found in the request, it indicates that the | ||
* local IP is being transmitted. Similarly, if the string `6a1c1487-a0af-4223-b142-a0f4621d0311` is found in the | ||
* request, it indicates that the advertising ID is being transmitted. | ||
*/ | ||
export type IndicatorValues = Partial<Record<LiteralUnion<Property, string>, ArrayOrSingle<string>>>; | ||
/** | ||
* Parse the requests in a HAR traffic dump and extract tracking data. | ||
* | ||
* This always tries to parse requests with the tracker-specific adapters first. If none of them can handle a request, | ||
* and `options.indicatorValues` is provided, it will fall back to indicator matching. | ||
* | ||
* @param har A traffic dump in HAR format. | ||
@@ -210,9 +251,14 @@ * @param options An optional object that can configure the following options: | ||
* result. | ||
* - `indicatorValues`: An object that specifies known honey data values for certain properties. If no adapter could match | ||
* the request but indicator values are provided, this function will fall back to indicator matching and try to | ||
* find the indicator values in the request headers, path or body. See {@link IndicatorValues}. | ||
* | ||
* @returns An array of results, corresponding to each request in the HAR file. If a request could not be processed | ||
* (i.e. if no adapter was found that could handle it), the corresponding entry in the array will be `undefined`. | ||
* (i.e. if no adapter was found that could handle it and indicator matching, if enabled, didn't produce any results), | ||
* the corresponding entry in the array will be `undefined`. | ||
*/ | ||
export const process: <ValuesOnly extends boolean = false>(har: Har, options?: { | ||
valuesOnly?: ValuesOnly | undefined; | ||
} | undefined) => Promise<ValuesOnly extends true ? (Partial<Record<Property, any[]>> | undefined)[] : (AnnotatedResult | undefined)[]>; | ||
indicatorValues?: Partial<Record<LiteralUnion<Property, string>, ArrayOrSingle<string>>> | undefined; | ||
} | undefined) => Promise<ValuesOnly extends true ? (Partial<Record<LiteralUnion<Property, string>, any[]>> | undefined)[] : (AnnotatedResult | undefined)[]>; | ||
/** | ||
@@ -219,0 +265,0 @@ * An array of all available adapters. |
{ | ||
"name": "trackhar", | ||
"version": "0.1.1", | ||
"version": "0.2.0", | ||
"description": "Library for detecting tracking data transmissions from traffic in HAR format.", | ||
@@ -54,3 +54,5 @@ "bugs": "https://github.com/tweaselORG/TrackHAR/issues", | ||
"@types/har-format": "^1.2.10", | ||
"base64-search": "^1.0.0", | ||
"cross-dirname": "^0.1.0", | ||
"escape-string-regexp": "^5.0.0", | ||
"jsonpath-plus": "^7.2.0", | ||
@@ -57,0 +59,0 @@ "protobufjs": "^7.2.3", |
113
README.md
@@ -7,7 +7,13 @@ # TrackHAR | ||
To achieve this, TrackHAR uses adapters written for specific tracking endpoints. In our [research](https://benjamin-altpeter.de/doc/thesis-consent-dialogs.pdf), we have found that generic approaches (like indicator matching in the raw transmitted plain text or [base64-encoded](https://github.com/baltpeter/base64-search) request content) are not sufficient due to the frankly ridiculous nesting and obfuscation we observed. In addition, approaches that search for static honey data values can never capture dynamic data types such as free disk space and current RAM usage, or low-entropy values like the operating system version (e.g. `11`). | ||
However, we have also noticed that there is a comparatively small number of tracking endpoints which make up a large portion of all app traffic. This makes our adapter-based approach feasible to detect most of the transmitted tracking data. But it will never be possible to write an adapter for every request. As such, we plan to implement [support for indicator matching](https://github.com/tweaselORG/TrackHAR/issues/6) as a fallback for requests not covered by any adapter in the future. | ||
To achieve this, TrackHAR uses two complementary approaches: adapter-based parsing and indicator matching. | ||
An important additional goal of TrackHAR is to produce outputs that make it possible to automatically generate human-readable documentation that allows people to comprehend why we detected each data transmission. This is especially important to submit complaints against illegal tracking to the data protection authorities. The generation of these reports is not handled by TrackHAR itself, but this requirement influences the design of our adapters and return values. As a result, the adapters are not regular functions that know how to handle a request, but implement a specific custom decoding "language" that can more easily be parsed and reasoned about automatically. | ||
* **Adapter-based parsing**: Our main approach is to use adapters written for specific tracking endpoints. In our [research](https://benjamin-altpeter.de/doc/thesis-consent-dialogs.pdf), we have found that generic approaches (like indicator matching in the raw transmitted plain text or [base64-encoded](https://github.com/baltpeter/base64-search) request content) are not sufficient due to the frankly ridiculous nesting and obfuscation we observed. In addition, approaches that search for static honey data values can never capture dynamic data types such as free disk space and current RAM usage, or low-entropy values like the operating system version (e.g. `11`). | ||
However, we have also noticed that there is a comparatively small number of tracking endpoints which make up a large portion of all app traffic. This makes our adapter-based approach feasible to detect most of the transmitted tracking data. | ||
* **Indicator matching**: But it will never be possible to write an adapter for every request. Thus, we use indicator matching as a fallback for requests not covered by any adapter. Indicator matching relies on the user providing known honey data values (such as the advertising ID or geolocation) that are then searched for in the requests. TrackHAR supports indicator matching for plain text, base64-encoded and URL-encoded values in the request headers, path, or body. It also tries to match case-insensitively where possible. | ||
Note that TrackHAR is designed to err on the side of matching too little instead of overmatching. Both the adapters and indicator matching can miss transmitted tracking data. Conversely, however, you can be sure that any data that TrackHAR detects is actually transmitted. This is beneficial for research but also for legal enforcement against tracking. | ||
An important additional goal of TrackHAR is to produce outputs that make it possible to automatically generate human-readable documentation that allows people to comprehend why we detected each data transmission. This is especially important to submit complaints against illegal tracking to the data protection authorities. The generation of these reports is not handled by TrackHAR itself, but this requirement influences the design of our adapters and return values. As a result, the adapters are not regular functions that know how to handle a request, but implement a specific custom decoding "language" that can more easily be parsed and reasoned about automatically. This documentation is generated in [tweaselORG/tracker-wiki](https://github.com/tweaselORG/tracker-wiki) and hosted at [trackers.tweasel.org](https://trackers.tweasel.org). | ||
## Installation | ||
@@ -111,2 +117,103 @@ | ||
If you want to enable indicator matching for requests not handled by any adapter, you need to provide an object with indicator values for certain properties: | ||
```ts | ||
import { readFile } from 'fs/promises'; | ||
import { process as processHar } from 'trackhar'; | ||
(async () => { | ||
const har = await readFile(process.argv[2], 'utf-8'); | ||
const indicators = { | ||
localIp: [ '10.0.0.2', 'fd31:4159::a2a1' ], | ||
idfa: '6a1c1487-a0af-4223-b142-a0f4621d0311' | ||
}; | ||
const data = await processHar(JSON.parse(har), { indicatorValues: indicators }); | ||
for (const request of data) console.log(request, '\n'); | ||
})(); | ||
``` | ||
With this, we can see that our device's advertising ID was transmitted in the first request, after all: | ||
```ts | ||
[ | ||
{ | ||
adapter: 'indicators', | ||
property: 'idfa', | ||
context: 'body', | ||
path: '$[12]', | ||
reasoning: 'indicator matching (base64)', | ||
value: 'NmExYzE0ODctYTBhZi00MjIzLWIxNDItYTBmNDYyMWQwMzEx' | ||
} | ||
] | ||
// [second request as before…] | ||
``` | ||
In this case, it was not transmitted as plain text but base64-encoded. TrackHAR was still able to detect it. The `path` indicates the index into the body where the IDFA was found. | ||
## Contributing adapters | ||
As stated, TrackHAR uses so-called adapters to detect tracking traffic. They are JavaScript objects defining a decoding algorithm for the request and the paths to the transmitted data in the decoded request. For each endpoint of a tracker, a separate adapter needs to be defined. To determine which adapter fits a request, the URL is matched against the `endpointUrls` of the adapter, which can either just use string matching or a regular expression. If one of the endpoints matches, the adapter is chosen to analyze the request. Where the same endpoint expects different data formats, multiple adapters with identical `endpointUrls` might be required. In that case, the `match` function of an adapter will be used to determine which adapter to apply to a request. The first adapter to return `true` in its matching method is chosen. Only one adapter can match a request at a time. | ||
### Gathering data | ||
If you want to contribute an adapter, first gather some actual traffic that contains requests to the endpoint you want to write an adapter for (you can use [tweaselORG/cyanoacrylate](https://github.com/tweaselORG/cyanoacrylate) for that). For an adapter to make it to the database, the tracker needs to contact the endpoint from two separate apps, preferably from different developers. From your collected requests, first try to manually decode one, e.g. using [CyberChef](https://gchq.github.io/CyberChef/), and note the steps it took you to decode the request. Then, look at the decoded request and try to determine the types of data transmitted. | ||
One way of doing that is by the property name. Some property names are obviously connected to one data type, e.g. a property named `screen_width` likely contains the screen width of the device. In other cases, the values are very obvious, e.g. `iOS` likely refers to the name of the operating system. In many cases, however, it is not that clear cut. You might recognize a property from specific honey data you planted, such as the longitude and latitude of a fake location which show up in the request data or the IDFA of the device that you know beforehand. In these cases, you need to publicly document your research showing how you reached the conclusion that a specific type of data is present in the request and refer to that research in the adapter via a permalink (we are currently working on a proper place for this, see [tweaselORG/meta#3](https://github.com/tweaselORG/meta/issues/3)). | ||
### Metadata | ||
Adapters are grouped within one file for each tracking company in the `src/adapters` directory. The file name must be the `slug` of the `Tracker` that the adapters in that file belong to. Each file must export an array of adapters as `adapters`. In the tracker file, first define the meta information of the tracker, such as: | ||
```js | ||
const tracker: Tracker = { | ||
slug: '<tracker slug>', | ||
name: '<legal name of the tracking company>', | ||
datenanfragenSlug: '<slug of the tracking company at datarequests.org>', | ||
exodusId: 0, // ID of the tracker in the Exodus database | ||
}; | ||
``` | ||
You can find the `datenanfragenSlug` of the tracking company in [the datarequests.org database](https://www.datarequests.org/company/), by searching for the company and copying the slug from the URL: `https://www.datarequests.org/company/<datenanfragenSlug>/`. For the `exodusId`, search the tracker in [their database](https://reports.exodus-privacy.eu.org/en/trackers/) and again copy the ID from the URL: `https://reports.exodus-privacy.eu.org/en/trackers/<exodusId>/`. | ||
### Adapter matching | ||
After that, you can start by defining the adapter. You can use variables to reuse parts of adapters you need more than once. Start by giving your adapter a `slug`, which needs to be unique within one tracking company. You can also add the tracker information you defined earlier, in the `tracker` property. In the `endpointUrls`, TrackHAR expects an array of strings or regular expressions of all URLs the adapter defines decoding steps and data paths for. Often, you'd want to use a regex to match URLs which might contain some data in the URL as well. TrackHAR always matches against the full URL, including protocol and query. If the requests to two endpoints are similar but slightly different, write two different adapters for them. You should again pull out parts of the adapter into variables to avoid duplicating the code. If there are different requests which require specific handling to the same endpoint, you also need to split your adapter to match only a single type of request. To do that, match the adapter to the same endpoint and define a `match` method in both adapters. It receives a `Request` object containing the raw data of the request and should return `true` if the adapter applies to the request. Typically you’d match against characteristic characters in the body or the `Content-Type` header to determine if an adapter can parse the request, see e.g. this `match` method of a Facebook adapter: | ||
```js | ||
match: (request) => request.content?.startsWith('{"'), | ||
``` | ||
### Decoding Steps | ||
If your adapter matches, next it tries to decode the data in the request. Therefore, you must define the algorithm to use in order to decode all relevant data in the `decodingSteps` property of your adapter. Because we automatically generate documentation for the adapters, we defined our own schema describing the decoding algorithm: The `decodingSteps` are an array of `DecodingStep` objects, which contain a function and what parts of the request they should work on. The `function` is set as the string name of a predefined decoding function, such as `parseJson` (look at the [API docs](/docs/README.md) for the full set). Each function takes an `input` argument, which expects a path in the global decoding state. This state is basically an object in which you can write temporary data to any property. It is initialized with the data from each context of the request, so the `body` property contains the raw request body, the `query` property contains the raw query string and so on. You can overwrite those values and, if the values are objects, use a JSONPath, like `body.identifiers.idfa`, to access or overwrite its properties. If you want to run a function on each element of an array, you can also use the `mapInput` property of a `DecodingStep` instead of the `input` property. In that case the function will be mapped over the non-empty entries of the array at the input path, like so: `mapInput.filter((i) => i !== undefined && i !== null).map(function)`. Each `DecodingStep` also expects an `output` property, which specifies the variable in the decoding state in which to return the result of the function to. This can simply be a generic variable name, a property access with the `.` operator (notably, here we don’t support all the other features of JSONPath) if you want to save the value into a property on an object in the state, or a path on the special `res` object. The `res` object has one property for each context (`body`, `query`, etc.) 
and this is the object in which TrackHAR expects the final decoded request, which is then passed on to the next processing step. These are the basics of how to construct the decoding algorithm. Let’s take a look at an example from the Facebook graph adapter: | ||
```js | ||
decodingSteps: [ | ||
{ function: 'parseJson', input: 'body', output: 'b' }, | ||
{ function: 'parseJson', input: 'b.batch', output: 'batch' }, | ||
{ function: 'getProperty', mapInput: 'batch', options: { path: 'relative_url' }, output: 'relativeUrls' }, | ||
{ function: 'parseQueryString', mapInput: 'relativeUrls', output: 'res.body.batch' }, | ||
{ function: 'getProperty', input: 'b', options: { path: 'batch_app_id' }, output: 'res.body.batch_app_id' }, | ||
], | ||
``` | ||
As you can see, some functions additionally take in `options`, like the `getProperty` function which needs the path to the property it should read. Functions are run in the order in which they appear in the array. Results are only written to the `output` if they are non-empty. In this case, the adapter first parses the body as a JSON and writes that to the temporary variable `b`. Then, it parses the `batch` property of the previously parsed JSON as a JSON again and writes that to a temporary variable as well. The content of that `batch` variable is an array of similar objects, so the adapter uses `mapInput` to get the property `relative_url` from each of its elements and writes them into the temporary array `relativeUrls`. These strings are then parsed as query strings and the resulting array of objects is then written to the special `res` object in the `batch` property of the `body` context. We try to preserve the original property names in the `res` object as best as possible to make the result more comprehensible, such as in the last step, where the `batch_app_id` on the parsed body is read into the `batch_app_id` property of the result object. | ||
If a function you need doesn’t exist, you’ll need to define a new one in the `decode-functions.ts` and add it to the definition of the `DecodingStep` type in the `index.ts`. Try to keep the function general and simple and not adapter-specific. If you need some adapter specific logic, rather try and abstract it away in many functions and keep the logic in the `decodingSteps`. Your function should also have an easily comprehensible name in camelCase, which describes everything it does. You’ll also need to add a human-readable description to the [tracker-wiki translations](https://github.com/tweaselORG/tracker-wiki/tree/main/i18n). | ||
### Data paths | ||
To extract the data from the decoded request, the adapter specifies the path in which to find the data in the decoded request. This path is obviously a concept that TrackHAR imposes upon the request, because they are rarely in a consistent format. But nevertheless, the goal is to try and keep the path as close to the original data format as possible. The `containedDataPaths` property contains an object, in which each property name is one of different types of data which might be found in the request, with a `DataPath` spec of where to find that data as a value. These should be the result of the research mentioned above, meaning manual analysis of lots of requests or maybe reverse engineering of the formats (e.g. a protobuf schema). A `DataPath` consists of: | ||
- The `context` where the data was found, e.g. the `body`. | ||
- `path`, the JSONPath to access the property in the `res.<context>` object of the decoding context. This can contain [complex JSONPath notation supported by `jsonpath-plus`](https://jsonpath-plus.github.io/JSONPath/docs/ts/#syntax-through-examples). | ||
- The `reasoning` for why the path is assumed to contain this data type. This should either link to further research or documentation that makes that point clear, or explain how either the property name or the value is obviously connected to the data type. | ||
Properties in `containedDataPaths` may also contain an array of several `DataPath`s, because one data type might be found in several places in a request, e.g. the `language` might be part of the `query` but also a property in the `body`. If a property in a request contains more than one data type, it should be mentioned in all of these data types. For example, a property `body.platform` might contain a value like `Android 13.2`, which contains the `osName` as well as the `osVersion`. In this case, you’ll need to add the path `body.platform` to both of these properties in the `containedDataPaths`. | ||
In case you come across a data type that is not defined in the types yet, add your data type to the `Property` type in the `index.ts`. Give it an obvious, camelCased name and also a human-readable description in the [tracker-wiki translations](https://github.com/tweaselORG/tracker-wiki/tree/main/i18n). | ||
## License | ||
@@ -113,0 +220,0 @@ |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
New author
Supply chain riskA new npm collaborator published a version of the package for the first time. New collaborators are usually benign additions to a project, but do indicate a change to the security surface area of a package.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
400809
4259
222
7
1
+ Addedbase64-search@^1.0.0
+ Addedescape-string-regexp@^5.0.0
+ Addedbase64-js@1.5.1(transitive)
+ Addedbase64-search@1.1.0(transitive)
+ Addedbuffer@6.0.3(transitive)
+ Addedescape-string-regexp@5.0.0(transitive)
+ Addedieee754@1.2.1(transitive)