Data Theft Repackaged: A Case Study in Malicious Wrapper Packages on npm
The Socket Research Team breaks down a malicious wrapper package that uses obfuscation to harvest credentials and exfiltrate sensitive data.
carbon.auth.get_access_token
carbon.auth.get_white_labeling
carbon.crm.get_account
carbon.crm.get_accounts
carbon.crm.get_contact
carbon.crm.get_contacts
carbon.crm.get_lead
carbon.crm.get_leads
carbon.crm.get_opportunities
carbon.crm.get_opportunity
carbon.data_sources.add_tags
carbon.data_sources.query
carbon.data_sources.query_user_data_sources
carbon.data_sources.remove_tags
carbon.data_sources.revoke_access_token
carbon.embeddings.get_documents
carbon.embeddings.get_embeddings_and_chunks
carbon.embeddings.list
carbon.embeddings.upload_chunks_and_embeddings
carbon.files.create_user_file_tags
carbon.files.delete_file_tags
carbon.files.delete_many
carbon.files.delete_v2
carbon.files.get_parsed_file
carbon.files.get_raw_file
carbon.files.modify_cold_storage_parameters
carbon.files.move_to_hot_storage
carbon.files.query_user_files
carbon.files.query_user_files_deprecated
carbon.files.resync
carbon.files.upload
carbon.files.upload_from_url
carbon.files.upload_text
carbon.github.get_issue
carbon.github.get_issues
carbon.github.get_pr
carbon.github.get_pr_comments
carbon.github.get_pr_commits
carbon.github.get_pr_files
carbon.github.get_pull_requests
carbon.integrations.cancel
carbon.integrations.connect_data_source
carbon.integrations.connect_document360
carbon.integrations.connect_freshdesk
carbon.integrations.connect_gitbook
carbon.integrations.connect_guru
carbon.integrations.create_aws_iam_user
carbon.integrations.get_oauth_url
carbon.integrations.list_confluence_pages
carbon.integrations.list_conversations
carbon.integrations.list_data_source_items
carbon.integrations.list_folders
carbon.integrations.list_gitbook_spaces
carbon.integrations.list_labels
carbon.integrations.list_outlook_categories
carbon.integrations.list_repos
carbon.integrations.list_sharepoint_sites
carbon.integrations.sync_azure_blob_files
carbon.integrations.sync_azure_blob_storage
carbon.integrations.sync_confluence
carbon.integrations.sync_data_source_items
carbon.integrations.sync_files
carbon.integrations.sync_git_hub
carbon.integrations.sync_gitbook
carbon.integrations.sync_gmail
carbon.integrations.sync_outlook
carbon.integrations.sync_repos
carbon.integrations.sync_rss_feed
carbon.integrations.sync_s3_files
carbon.integrations.sync_slack
carbon.organizations.get
carbon.organizations.update
carbon.organizations.update_stats
carbon.users.delete
carbon.users.get
carbon.users.list
carbon.users.toggle_user_features
carbon.users.update_users
carbon.users.who_am_i
carbon.utilities.fetch_urls
carbon.utilities.fetch_webpage
carbon.utilities.fetch_youtube_transcripts
carbon.utilities.process_sitemap
carbon.utilities.scrape_sitemap
carbon.utilities.scrape_web
carbon.utilities.search_urls
carbon.utilities.user_webpages
carbon.webhooks.add_url
carbon.webhooks.delete_url
carbon.webhooks.urls
carbon.white_label.create
carbon.white_label.delete
carbon.white_label.list
carbon.white_label.update
Python >=3.7
pip install carbon-python-sdk==0.2.56
from carbon import Carbon
# 1) Get an access token for a customer
carbon = Carbon(
    api_key="YOUR_API_KEY",
    customer_id="YOUR_CUSTOMER_ID",
)
token = carbon.auth.get_access_token()
# 2) Use the access token to authenticate moving forward
carbon = Carbon(access_token=token.access_token)
# use SDK as usual
white_labeling = carbon.auth.get_white_labeling()
# etc.
async support is available by prepending "a" to any method name (for example, aget_access_token instead of get_access_token).
import asyncio
from pprint import pprint
from carbon import Carbon, ApiException
carbon = Carbon(
    access_token="YOUR_ACCESS_TOKEN",
    api_key="YOUR_API_KEY",
    customer_id="YOUR_CUSTOMER_ID",
)
async def main():
    try:
        # Get Access Token
        get_access_token_response = await carbon.auth.aget_access_token()
        print(get_access_token_response)
    except ApiException as e:
        print("Exception when calling AuthApi.get_access_token: %s\n" % e)
        pprint(e.body)
        if e.status == 422:
            pprint(e.body["detail"])
        pprint(e.headers)
        pprint(e.status)
        pprint(e.reason)
        pprint(e.round_trip_time)

asyncio.run(main())
To access raw HTTP response values, use the .raw namespace.
from pprint import pprint
from carbon import Carbon, ApiException
carbon = Carbon(
    access_token="YOUR_ACCESS_TOKEN",
    api_key="YOUR_API_KEY",
    customer_id="YOUR_CUSTOMER_ID",
)
try:
    # Get Access Token
    get_access_token_response = carbon.auth.raw.get_access_token()
    pprint(get_access_token_response.body)
    pprint(get_access_token_response.body["access_token"])
    pprint(get_access_token_response.body["refresh_token"])
    pprint(get_access_token_response.headers)
    pprint(get_access_token_response.status)
    pprint(get_access_token_response.round_trip_time)
except ApiException as e:
    print("Exception when calling AuthApi.get_access_token: %s\n" % e)
    pprint(e.body)
    if e.status == 422:
        pprint(e.body["detail"])
    pprint(e.headers)
    pprint(e.status)
    pprint(e.reason)
    pprint(e.round_trip_time)
carbon.auth.get_access_token
Get Access Token
get_access_token_response = carbon.auth.get_access_token()
/auth/v1/access_token
get
carbon.auth.get_white_labeling
Returns whether or not the organization is white labeled and which integrations are white labeled
get_white_labeling_response = carbon.auth.get_white_labeling()
/auth/v1/white_labeling
get
carbon.crm.get_account
Get Account
get_account_response = carbon.crm.get_account(
id="id_example",
data_source_id=1,
include_remote_data=False,
includes=["string_example"],
)
str
int
bool
List[BaseIncludes]
/integrations/data/crm/accounts/{id}
get
carbon.crm.get_accounts
Get Accounts
get_accounts_response = carbon.crm.get_accounts(
data_source_id=1,
include_remote_data=False,
next_cursor="string_example",
page_size=1,
order_dir="asc",
includes=[],
filters={},
order_by="created_at",
)
int
bool
Optional[str]
Optional[int]
OrderDirV2Nullable
List[BaseIncludes]
AccountFilters
AccountsOrderByNullable
/integrations/data/crm/accounts
post
carbon.crm.get_contact
Get Contact
get_contact_response = carbon.crm.get_contact(
id="id_example",
data_source_id=1,
include_remote_data=False,
includes=["string_example"],
)
str
int
bool
List[BaseIncludes]
/integrations/data/crm/contacts/{id}
get
carbon.crm.get_contacts
Get Contacts
get_contacts_response = carbon.crm.get_contacts(
data_source_id=1,
include_remote_data=False,
next_cursor="string_example",
page_size=1,
order_dir="asc",
includes=[],
filters={},
order_by="created_at",
)
int
bool
Optional[str]
Optional[int]
OrderDirV2Nullable
List[BaseIncludes]
ContactFilters
ContactsOrderByNullable
/integrations/data/crm/contacts
post
carbon.crm.get_lead
Get Lead
get_lead_response = carbon.crm.get_lead(
id="id_example",
data_source_id=1,
include_remote_data=False,
includes=["string_example"],
)
str
int
bool
List[BaseIncludes]
/integrations/data/crm/leads/{id}
get
carbon.crm.get_leads
Get Leads
get_leads_response = carbon.crm.get_leads(
data_source_id=1,
include_remote_data=False,
next_cursor="string_example",
page_size=1,
order_dir="asc",
includes=[],
filters={},
order_by="created_at",
)
int
bool
Optional[str]
Optional[int]
OrderDirV2Nullable
List[BaseIncludes]
LeadFilters
LeadsOrderByNullable
/integrations/data/crm/leads
post
carbon.crm.get_opportunities
Get Opportunities
get_opportunities_response = carbon.crm.get_opportunities(
data_source_id=1,
include_remote_data=False,
next_cursor="string_example",
page_size=1,
order_dir="asc",
includes=[],
filters={
"status": "WON",
},
order_by="created_at",
)
int
bool
Optional[str]
Optional[int]
OrderDirV2Nullable
List[BaseIncludes]
OpportunityFilters
OpportunitiesOrderByNullable
/integrations/data/crm/opportunities
post
carbon.crm.get_opportunity
Get Opportunity
get_opportunity_response = carbon.crm.get_opportunity(
id="id_example",
data_source_id=1,
include_remote_data=False,
includes=["string_example"],
)
str
int
bool
List[BaseIncludes]
/integrations/data/crm/opportunities/{id}
get
carbon.data_sources.add_tags
Add Data Source Tags
add_tags_response = carbon.data_sources.add_tags(
tags={},
data_source_id=1,
)
Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]
int
/data_sources/tags/add
post
carbon.data_sources.query
Data Sources
query_response = carbon.data_sources.query(
pagination={
"limit": 10,
"offset": 0,
"starting_id": 0,
},
order_by="created_at",
order_dir="desc",
filters={
"source": "GOOGLE_CLOUD_STORAGE",
},
)
Pagination
OrganizationUserDataSourceOrderByColumns
OrderDir
OrganizationUserDataSourceFilters
OrganizationUserDataSourceQueryInput
OrganizationUserDataSourceResponse
/data_sources
post
carbon.data_sources.query_user_data_sources
User Data Sources
query_user_data_sources_response = carbon.data_sources.query_user_data_sources(
pagination={
"limit": 10,
"offset": 0,
"starting_id": 0,
},
order_by="created_at",
order_dir="desc",
filters={
"source": "GOOGLE_CLOUD_STORAGE",
},
)
Pagination
OrganizationUserDataSourceOrderByColumns
OrderDir
OrganizationUserDataSourceFilters
OrganizationUserDataSourceQueryInput
OrganizationUserDataSourceResponse
/user_data_sources
post
carbon.data_sources.remove_tags
Remove Data Source Tags
remove_tags_response = carbon.data_sources.remove_tags(
data_source_id=1,
tags_to_remove=[],
remove_all_tags=False,
)
int
RemoveDataSourceTagsInputTagsToRemove
bool
/data_sources/tags/remove
post
carbon.data_sources.revoke_access_token
Revoke Access Token
revoke_access_token_response = carbon.data_sources.revoke_access_token(
data_source_id=1,
)
int
/revoke_access_token
post
carbon.embeddings.get_documents
For pre-filtering documents, using tags_v2 is preferred to using tags (which is now deprecated). If both tags_v2 and tags are specified, tags is ignored. tags_v2 enables building complex filters through the use of "AND", "OR", and negation logic. Take the below input as an example:
{
"OR": [
{
"key": "subject",
"value": "holy-bible",
"negate": false
},
{
"key": "person-of-interest",
"value": "jesus christ",
"negate": false
},
{
"key": "genre",
"value": "religion",
"negate": true
},
{
"AND": [
{
"key": "subject",
"value": "tao-te-ching",
"negate": false
},
{
"key": "author",
"value": "lao-tzu",
"negate": false
}
]
}
]
}
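As a hedged sketch, the same filter can be passed straight through as a Python dict via the tags_v2 parameter (the query text and k value are placeholders; the method signature follows the example further below):

# Hedged sketch: passing the tags_v2 filter above to an embeddings query.
# Assumes a configured `carbon` client; query and k are placeholders.
filter_v2 = {
    "OR": [
        {"key": "subject", "value": "holy-bible", "negate": False},
        {"key": "person-of-interest", "value": "jesus christ", "negate": False},
        {"key": "genre", "value": "religion", "negate": True},
        {
            "AND": [
                {"key": "subject", "value": "tao-te-ching", "negate": False},
                {"key": "author", "value": "lao-tzu", "negate": False},
            ]
        },
    ]
}

get_documents_response = carbon.embeddings.get_documents(
    query="creation narratives",
    k=5,
    tags_v2=filter_v2,
)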
In this case, files will be filtered such that they match at least one of the following: a "subject" tag equal to "holy-bible"; a "person-of-interest" tag equal to "jesus christ"; no "genre" tag equal to "religion"; or both a "subject" tag equal to "tao-te-ching" and an "author" tag equal to "lao-tzu".
Note that the top level of the query must be either an "OR" or "AND" array. Currently, nesting is limited to 3. For tag blocks (those with "key", "value", and "negate" keys), the following typing rules apply: "key" must be a string; "value" can be any or list[any]; "negate" must be true or false (if present and true, the filter block is negated in the resulting query; it is false by default).
When querying embeddings, you can optionally specify the media_type parameter in your request. By default (if not set), it is equal to "TEXT". This means that the query will be performed over files that have been parsed as text (for now, this covers all files except image files). If it is equal to "IMAGE", the query will be performed over image files (for now, .jpg and .png files). You can think of this field as an additional filter on top of any filters set in file_ids and tags_v2.
When hybrid_search is set to true, a combination of keyword search and semantic search is used to rank and select candidate embeddings during information retrieval. By default, these search methods are weighted equally during the ranking process. To adjust the weight (or "importance") of each search method, you can use the hybrid_search_tuning_parameters property. The tuning parameters are: weight_a, the weight to assign to semantic search, and weight_b, the weight to assign to keyword search. You must ensure that sum(weight_a, weight_b, ..., weight_n) for all n weights is equal to 1. The equality has an error tolerance of 0.001 to account for possible floating point issues.
In order to use hybrid search for a customer across a set of documents, two flags need to be enabled:
1. Use the /modify_user_configuration endpoint to enable sparse_vectors for the customer. The payload body for this request is below:
{
  "configuration_key_name": "sparse_vectors",
  "value": {
    "enabled": true
  }
}
2. Ensure that sparse vectors are generated for the files involved. For files uploaded via the /uploadfile endpoint, this can be done by setting the following query parameter: generate_sparse_vectors=true
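A minimal sketch of these two steps, assuming the SDK client from the setup examples above. The SDK methods listed in this reference do not cover /modify_user_configuration, so step 1 uses a plain HTTP call; the base URL and header names below are assumptions, not confirmed by this document:

# Hedged sketch of enabling hybrid search, per the two steps above.
import requests

BASE_URL = "https://api.carbon.ai"  # assumption

# Step 1: enable sparse_vectors for the customer.
requests.post(
    f"{BASE_URL}/modify_user_configuration",
    headers={
        "authorization": "Bearer YOUR_API_KEY",  # assumed auth scheme
        "customer-id": "YOUR_CUSTOMER_ID",       # assumed header name
    },
    json={
        "configuration_key_name": "sparse_vectors",
        "value": {"enabled": True},
    },
)

# Step 2: generate sparse vectors for the file at upload time.
upload_response = carbon.files.upload(
    file=open("/path/to/file", "rb"),
    generate_sparse_vectors=True,
)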
Carbon supports multiple models for use in generating embeddings for files. For images, we support Vertex AI's multimodal model; for text, we support OpenAI's text-embedding-ada-002 and Cohere's embed-multilingual-v3.0. The model can be specified via the embedding_model parameter (in the POST body for /embeddings, and a query parameter in /uploadfile). If no model is supplied, text-embedding-ada-002 is used by default. When performing embedding queries, only embeddings from files that used the specified model will be considered in the query. For example, if files A and B have embeddings generated with OPENAI, and files C and D have embeddings generated with COHERE_MULTILINGUAL_V3, then by default, queries will only consider files A and B. If COHERE_MULTILINGUAL_V3 is specified as the embedding_model in /embeddings, then only files C and D will be considered. Make sure that the set of all files you want considered for a query have embeddings generated via the same model. For now, do not set VERTEX_MULTIMODAL as an embedding_model. This model is used automatically by Carbon when it detects an image file.
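A hedged sketch of keeping the embedding model consistent between upload and query, per the guidance above (the URL and query text are placeholders):

# Hedged sketch: use the same embedding_model at upload and query time so the
# query considers the files you expect.
upload_from_url_response = carbon.files.upload_from_url(
    url="https://example.com/handbook.pdf",  # placeholder URL
    embedding_model="COHERE_MULTILINGUAL_V3",
)

# Only files embedded with COHERE_MULTILINGUAL_V3 are considered here.
get_documents_response = carbon.embeddings.get_documents(
    query="vacation policy",
    k=3,
    embedding_model="COHERE_MULTILINGUAL_V3",
)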
get_documents_response = carbon.embeddings.get_documents(
query="a",
k=1,
tags={
"key": "string_example",
},
query_vector=[3.14],
file_ids=[1],
parent_file_ids=[1],
include_all_children=False,
tags_v2={},
include_tags=True,
include_vectors=True,
include_raw_file=True,
hybrid_search=True,
hybrid_search_tuning_parameters={
"weight_a": 0.5,
"weight_b": 0.5,
},
media_type="TEXT",
embedding_model="OPENAI",
include_file_level_metadata=False,
high_accuracy=False,
rerank={
"model": "model_example",
},
file_types_at_source=["string_example"],
exclude_cold_storage_files=False,
)
str
Query for which to get related chunks and embeddings.
int
Number of related chunks to return.
GetEmbeddingDocumentsBodyTags
GetEmbeddingDocumentsBodyQueryVector
GetEmbeddingDocumentsBodyFileIds
GetEmbeddingDocumentsBodyParentFileIds
bool
Flag to control whether or not to include all children of filtered files in the embedding search.
Optional[Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]]
A set of tags to limit the search to. Use this instead of tags
, which is deprecated.
Optional[bool]
Flag to control whether or not to include tags for each chunk in the response.
Optional[bool]
Flag to control whether or not to include embedding vectors in the response.
Optional[bool]
Flag to control whether or not to include a signed URL to the raw file containing each chunk in the response.
Optional[bool]
Flag to control whether or not to perform hybrid search.
HybridSearchTuningParamsNullable
FileContentTypesNullable
EmbeddingGeneratorsNullable
Optional[bool]
Flag to control whether or not to include file-level metadata in the response. This metadata will be included in the content_metadata
field of each document along with chunk/embedding level metadata.
Optional[bool]
Flag to control whether or not to perform a high accuracy embedding search. By default, this is set to false. If true, the search may return more accurate results, but may take longer to complete.
RerankParamsNullable
GetEmbeddingDocumentsBodyFileTypesAtSource
bool
Flag to control whether or not to exclude files that are not in hot storage. If set to False, then an error will be returned if any filtered files are in cold storage.
/embeddings
post
carbon.embeddings.get_embeddings_and_chunks
Retrieve Embeddings And Content
get_embeddings_and_chunks_response = carbon.embeddings.get_embeddings_and_chunks(
filters={
"user_file_id": 1,
"embedding_model": "OPENAI",
},
pagination={
"limit": 10,
"offset": 0,
"starting_id": 0,
},
order_by="created_at",
order_dir="desc",
include_vectors=False,
)
EmbeddingsAndChunksFilters
Pagination
EmbeddingsAndChunksOrderByColumns
OrderDir
bool
/text_chunks
post
carbon.embeddings.list
Retrieve Embeddings And Content V2
list_response = carbon.embeddings.list(
filters={
"include_all_children": False,
"non_synced_only": False,
},
pagination={
"limit": 10,
"offset": 0,
"starting_id": 0,
},
order_by="created_at",
order_dir="desc",
include_vectors=False,
)
OrganizationUserFilesToSyncFilters
Pagination
OrganizationUserFilesToSyncOrderByTypes
OrderDir
bool
EmbeddingsAndChunksQueryInputV2
/list_chunks_and_embeddings
post
carbon.embeddings.upload_chunks_and_embeddings
Upload Chunks And Embeddings
upload_chunks_and_embeddings_response = carbon.embeddings.upload_chunks_and_embeddings(
embedding_model="OPENAI",
chunks_and_embeddings=[
{
"file_id": 1,
"chunks_and_embeddings": [
{
"chunk_number": 1,
"chunk": "chunk_example",
}
],
}
],
overwrite_existing=False,
chunks_only=False,
custom_credentials={
"key": {},
},
)
EmbeddingGenerators
List[SingleChunksAndEmbeddingsUploadInput]
bool
bool
ChunksAndEmbeddingsUploadInputCustomCredentials
ChunksAndEmbeddingsUploadInput
/upload_chunks_and_embeddings
post
carbon.files.create_user_file_tags
A tag is a key-value pair that can be added to a file. This pair can then be used for searches (e.g. embedding searches) in order to narrow down the scope of the search. A file can have any number of tags. The following are reserved keys that cannot be used:
Carbon currently supports two data types for tag values - string and list<string>. Keys can only be string. If values other than string and list<string> are used, they're automatically converted to strings (e.g. 4 will become "4").
create_user_file_tags_response = carbon.files.create_user_file_tags(
tags={
"key": "string_example",
},
organization_user_file_id=1,
)
OrganizationUserFileTagCreateTags
int
/create_user_file_tags
post
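As a hedged sketch, a tag value can also be a list<string>, per the data types note above (the tag key, values, and file ID are placeholders):

# Hedged sketch: a tag whose value is a list<string>. Non-string values
# (e.g. 4) would be converted to strings automatically.
create_user_file_tags_response = carbon.files.create_user_file_tags(
    tags={
        "genres": ["religion", "philosophy"],  # placeholder key and values
    },
    organization_user_file_id=1,
)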
carbon.files.delete_file_tags
Delete File Tags
delete_file_tags_response = carbon.files.delete_file_tags(
tags=["string_example"],
organization_user_file_id=1,
)
OrganizationUserFileTagsRemoveTags
int
OrganizationUserFileTagsRemove
/delete_user_file_tags
post
carbon.files.delete_many
Delete Files Endpoint
delete_many_response = carbon.files.delete_many(
file_ids=[1],
sync_statuses=["string_example"],
delete_non_synced_only=False,
send_webhook=False,
delete_child_files=False,
)
DeleteFilesQueryInputFileIds
List[ExternalFileSyncStatuses]
bool
bool
bool
/delete_files
post
carbon.files.delete_v2
Delete Files V2 Endpoint
delete_v2_response = carbon.files.delete_v2(
filters={
"include_all_children": False,
"non_synced_only": False,
},
send_webhook=False,
preserve_file_record=False,
)
OrganizationUserFilesToSyncFilters
bool
bool
Whether or not to delete all data related to the file from the database, BUT to preserve the file metadata, allowing for resyncs. By default preserve_file_record
is false, which means that all data related to the file as well as its metadata will be deleted. Note that even if preserve_file_record
is true, raw files uploaded via the uploadfile
endpoint still cannot be resynced.
/delete_files_v2
post
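A hedged sketch of pairing preserve_file_record with a later resync (filter values and file ID are placeholders; raw files uploaded via the uploadfile endpoint cannot be resynced):

# Hedged sketch: delete file data but preserve the metadata record so the
# file can be resynced later.
delete_v2_response = carbon.files.delete_v2(
    filters={
        "include_all_children": False,
        "non_synced_only": False,
    },
    preserve_file_record=True,
)

# Later, trigger a resync of a preserved file by its ID.
resync_response = carbon.files.resync(
    file_id=1,
)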
carbon.files.get_parsed_file
This route is deprecated. Use /user_files_v2
instead.
get_parsed_file_response = carbon.files.get_parsed_file(
file_id=1,
)
int
/parsed_file/{file_id}
get
carbon.files.get_raw_file
This route is deprecated. Use /user_files_v2
instead.
get_raw_file_response = carbon.files.get_raw_file(
file_id=1,
)
int
/raw_file/{file_id}
get
carbon.files.modify_cold_storage_parameters
Modify Cold Storage Parameters
modify_cold_storage_parameters_response = carbon.files.modify_cold_storage_parameters(
filters={
"include_all_children": False,
"non_synced_only": False,
},
enable_cold_storage=True,
hot_storage_time_to_live=1,
)
OrganizationUserFilesToSyncFilters
Optional[bool]
Optional[int]
ModifyColdStorageParametersQueryInput
/modify_cold_storage_parameters
post
carbon.files.move_to_hot_storage
Move To Hot Storage
move_to_hot_storage_response = carbon.files.move_to_hot_storage(
filters={
"include_all_children": False,
"non_synced_only": False,
},
)
OrganizationUserFilesToSyncFilters
/move_to_hot_storage
post
carbon.files.query_user_files
For pre-filtering documents, using tags_v2 is preferred to using tags (which is now deprecated). If both tags_v2 and tags are specified, tags is ignored. tags_v2 enables building complex filters through the use of "AND", "OR", and negation logic. Take the below input as an example:
{
"OR": [
{
"key": "subject",
"value": "holy-bible",
"negate": false
},
{
"key": "person-of-interest",
"value": "jesus christ",
"negate": false
},
{
"key": "genre",
"value": "religion",
"negate": true
},
{
"AND": [
{
"key": "subject",
"value": "tao-te-ching",
"negate": false
},
{
"key": "author",
"value": "lao-tzu",
"negate": false
}
]
}
]
}
In this case, files will be filtered such that they match at least one of the following: a "subject" tag equal to "holy-bible"; a "person-of-interest" tag equal to "jesus christ"; no "genre" tag equal to "religion"; or both a "subject" tag equal to "tao-te-ching" and an "author" tag equal to "lao-tzu".
Note that the top level of the query must be either an "OR" or "AND" array. Currently, nesting is limited to 3. For tag blocks (those with "key", "value", and "negate" keys), the following typing rules apply: "key" must be a string; "value" can be any or list[any]; "negate" must be true or false (if present and true, the filter block is negated in the resulting query; it is false by default).
query_user_files_response = carbon.files.query_user_files(
pagination={
"limit": 10,
"offset": 0,
"starting_id": 0,
},
order_by="created_at",
order_dir="desc",
filters={
"include_all_children": False,
"non_synced_only": False,
},
include_raw_file=True,
include_parsed_text_file=True,
include_additional_files=True,
presigned_url_expiry_time_seconds=3600,
)
Pagination
OrganizationUserFilesToSyncOrderByTypes
OrderDir
OrganizationUserFilesToSyncFilters
Optional[bool]
If true, the query will return presigned URLs for the raw file. Only relevant for the /user_files_v2 endpoint.
Optional[bool]
If true, the query will return presigned URLs for the parsed text file. Only relevant for the /user_files_v2 endpoint.
Optional[bool]
If true, the query will return presigned URLs for additional files. Only relevant for the /user_files_v2 endpoint.
int
The expiry time for the presigned URLs. Only relevant for the /user_files_v2 endpoint.
OrganizationUserFilesToSyncQueryInput
/user_files_v2
post
carbon.files.query_user_files_deprecated
This route is deprecated. Use /user_files_v2
instead.
query_user_files_deprecated_response = carbon.files.query_user_files_deprecated(
pagination={
"limit": 10,
"offset": 0,
"starting_id": 0,
},
order_by="created_at",
order_dir="desc",
filters={
"include_all_children": False,
"non_synced_only": False,
},
include_raw_file=True,
include_parsed_text_file=True,
include_additional_files=True,
presigned_url_expiry_time_seconds=3600,
)
Pagination
OrganizationUserFilesToSyncOrderByTypes
OrderDir
OrganizationUserFilesToSyncFilters
Optional[bool]
If true, the query will return presigned URLs for the raw file. Only relevant for the /user_files_v2 endpoint.
Optional[bool]
If true, the query will return presigned URLs for the parsed text file. Only relevant for the /user_files_v2 endpoint.
Optional[bool]
If true, the query will return presigned URLs for additional files. Only relevant for the /user_files_v2 endpoint.
int
The expiry time for the presigned URLs. Only relevant for the /user_files_v2 endpoint.
OrganizationUserFilesToSyncQueryInput
FilesQueryUserFilesDeprecatedResponse
/user_files
post
carbon.files.resync
Resync File
resync_response = carbon.files.resync(
file_id=1,
chunk_size=1,
chunk_overlap=1,
force_embedding_generation=False,
skip_file_processing=False,
)
int
Optional[int]
Optional[int]
bool
Optional[bool]
/resync_file
post
carbon.files.upload
This endpoint is used to directly upload local files to Carbon. The POST request should be a multipart form request. Note that the set_page_as_boundary query parameter is applicable only to PDFs for now. When this value is set, PDF chunks are at most one page long. Additional information can be retrieved for each chunk, however, namely the coordinates of the bounding box around the chunk (this can be used for things like text highlighting). Following is a description of all possible query parameters:
chunk_size: the chunk size (in tokens) applied when splitting the document
chunk_overlap: the chunk overlap (in tokens) applied when splitting the document
skip_embedding_generation: whether or not to skip the generation of chunks and embeddings
set_page_as_boundary: described above
embedding_model: the model used to generate embeddings for the document chunks
use_ocr: whether or not to use OCR as a preprocessing step prior to generating chunks. Valid for PDFs, JPEGs, and PNGs
generate_sparse_vectors: whether or not to generate sparse vectors for the file. Required for hybrid search.
prepend_filename_to_chunks: whether or not to prepend the filename to the chunk text
Carbon supports multiple models for use in generating embeddings for files. For images, we support Vertex AI's multimodal model; for text, we support OpenAI's text-embedding-ada-002 and Cohere's embed-multilingual-v3.0. The model can be specified via the embedding_model parameter (in the POST body for /embeddings, and a query parameter in /uploadfile). If no model is supplied, text-embedding-ada-002 is used by default. When performing embedding queries, only embeddings from files that used the specified model will be considered in the query. For example, if files A and B have embeddings generated with OPENAI, and files C and D have embeddings generated with COHERE_MULTILINGUAL_V3, then by default, queries will only consider files A and B. If COHERE_MULTILINGUAL_V3 is specified as the embedding_model in /embeddings, then only files C and D will be considered. Make sure that the set of all files you want considered for a query have embeddings generated via the same model. For now, do not set VERTEX_MULTIMODAL as an embedding_model. This model is used automatically by Carbon when it detects an image file.
upload_response = carbon.files.upload(
file=open("/path/to/file", "rb"),
chunk_size=1,
chunk_overlap=1,
skip_embedding_generation=False,
set_page_as_boundary=False,
embedding_model="string_example",
use_ocr=False,
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
max_items_per_chunk=1,
parse_pdf_tables_with_ocr=False,
detect_audio_language=False,
transcription_service="assemblyai",
include_speaker_labels=False,
media_type="TEXT",
split_rows=False,
enable_cold_storage=False,
hot_storage_time_to_live=1,
generate_chunks_only=False,
store_file_only=False,
)
IO
Optional[int]
Chunk size in tiktoken tokens to be used when processing file.
Optional[int]
Chunk overlap in tiktoken tokens to be used when processing file.
bool
Flag to control whether or not embeddings should be generated and stored when processing file.
bool
Flag to control whether or not to set a page's worth of content as the maximum amount of content that can appear in a chunk. Only valid for PDFs. See the route description for more information.
Union[TextEmbeddingGenerators, MultiModalEmbeddingGenerators]
Embedding model that will be used to embed file chunks.
bool
Whether or not to use OCR when processing files. Valid for PDFs, JPEGs, and PNGs. Useful for documents with tables, images, and/or scanned text.
bool
Whether or not to generate sparse vectors for the file. This is required for the file to be a candidate for hybrid search.
bool
Whether or not to prepend the file's name to chunks.
Optional[int]
Number of objects per chunk. For csv, tsv, xlsx, and json files only.
bool
Whether to use rich table parsing when use_ocr
is enabled.
bool
Whether to automatically detect the language of the uploaded audio file.
TranscriptionServiceNullable
The transcription service to use for audio files. If no service is specified, 'deepgram' will be used.
bool
Detect multiple speakers and label segments of speech by speaker for audio files.
FileContentTypesNullable
The media type of the file. If not provided, it will be inferred from the file extension.
bool
Whether to split tabular rows into chunks. Currently only valid for CSV, TSV, and XLSX files.
bool
Enable cold storage for the file. If set to true, the file will be moved to cold storage after a certain period of inactivity. Default is false.
Optional[int]
Time in days after which the file will be moved to cold storage. Must be one of [1, 3, 7, 14, 30].
bool
If this flag is enabled, the file will be chunked and stored with Carbon, but no embeddings will be generated. This overrides the skip_embedding_generation flag.
bool
If this flag is enabled, the file will be stored with Carbon, but no processing will be done.
BodyCreateUploadFileUploadfilePost
/uploadfile
post
carbon.files.upload_from_url
Create Upload File From Url
upload_from_url_response = carbon.files.upload_from_url(
url="string_example",
file_name="string_example",
chunk_size=1,
chunk_overlap=1,
skip_embedding_generation=False,
set_page_as_boundary=False,
embedding_model="OPENAI",
generate_sparse_vectors=False,
use_textract=False,
prepend_filename_to_chunks=False,
max_items_per_chunk=1,
parse_pdf_tables_with_ocr=False,
detect_audio_language=False,
transcription_service="assemblyai",
include_speaker_labels=False,
media_type="TEXT",
split_rows=False,
cold_storage_params={
"enable_cold_storage": False,
},
generate_chunks_only=False,
store_file_only=False,
)
str
Optional[str]
Optional[int]
Optional[int]
bool
bool
EmbeddingGenerators
bool
bool
bool
Optional[int]
Number of objects per chunk. For csv, tsv, xlsx, and json files only.
bool
bool
TranscriptionServiceNullable
bool
FileContentTypesNullable
bool
ColdStorageProps
bool
If this flag is enabled, the file will be chunked and stored with Carbon, but no embeddings will be generated. This overrides the skip_embedding_generation flag.
bool
If this flag is enabled, the file will be stored with Carbon, but no processing will be done.
/upload_file_from_url
post
carbon.files.upload_text
Carbon supports multiple models for use in generating embeddings for files. For images, we support Vertex AI's multimodal model; for text, we support OpenAI's text-embedding-ada-002 and Cohere's embed-multilingual-v3.0. The model can be specified via the embedding_model parameter (in the POST body for /embeddings, and a query parameter in /uploadfile). If no model is supplied, text-embedding-ada-002 is used by default. When performing embedding queries, only embeddings from files that used the specified model will be considered in the query. For example, if files A and B have embeddings generated with OPENAI, and files C and D have embeddings generated with COHERE_MULTILINGUAL_V3, then by default, queries will only consider files A and B. If COHERE_MULTILINGUAL_V3 is specified as the embedding_model in /embeddings, then only files C and D will be considered. Make sure that the set of all files you want considered for a query have embeddings generated via the same model. For now, do not set VERTEX_MULTIMODAL as an embedding_model. This model is used automatically by Carbon when it detects an image file.
upload_text_response = carbon.files.upload_text(
contents="aaaaa",
name="string_example",
chunk_size=1,
chunk_overlap=1,
skip_embedding_generation=False,
overwrite_file_id=1,
embedding_model="OPENAI",
generate_sparse_vectors=False,
cold_storage_params={
"enable_cold_storage": False,
},
generate_chunks_only=False,
store_file_only=False,
)
str
Optional[str]
Optional[int]
Optional[int]
bool
Optional[int]
EmbeddingGeneratorsNullable
Optional[bool]
ColdStorageProps
bool
If this flag is enabled, the file will be chunked and stored with Carbon, but no embeddings will be generated. This overrides the skip_embedding_generation flag.
bool
If this flag is enabled, the file will be stored with Carbon, but no processing will be done.
/upload_text
post
carbon.github.get_issue
Issue
get_issue_response = carbon.github.get_issue(
issue_number=1,
include_remote_data=False,
data_source_id=1,
repository="string_example",
)
int
bool
int
str
/integrations/data/github/issues/{issue_number}
get
carbon.github.get_issues
Issues
get_issues_response = carbon.github.get_issues(
data_source_id=1,
repository="string_example",
include_remote_data=False,
page=1,
page_size=30,
next_cursor="string_example",
filters={
"state": "closed",
},
order_by="created",
order_dir="asc",
)
int
str
Full name of the repository, denoted as {owner}/{repo}
bool
int
int
Optional[str]
IssuesFilter
IssuesOrderBy
OrderDirV2Nullable
/integrations/data/github/issues
post
carbon.github.get_pr
Get Pr
get_pr_response = carbon.github.get_pr(
pull_number=1,
include_remote_data=False,
data_source_id=1,
repository="string_example",
)
int
bool
int
str
/integrations/data/github/pull_requests/{pull_number}
get
carbon.github.get_pr_comments
Pr Comments
get_pr_comments_response = carbon.github.get_pr_comments(
data_source_id=1,
repository="string_example",
pull_number=1,
include_remote_data=False,
page=1,
page_size=30,
next_cursor="string_example",
order_by="created",
order_dir="asc",
)
int
str
Full name of the repository, denoted as {owner}/{repo}
int
bool
int
int
Optional[str]
CommentsOrderBy
OrderDirV2Nullable
/integrations/data/github/pull_requests/comments
post
carbon.github.get_pr_commits
Pr Commits
get_pr_commits_response = carbon.github.get_pr_commits(
data_source_id=1,
repository="string_example",
pull_number=1,
include_remote_data=False,
page=1,
page_size=30,
next_cursor="string_example",
)
int
str
Full name of the repository, denoted as {owner}/{repo}
int
bool
int
int
Optional[str]
/integrations/data/github/pull_requests/commits
post
carbon.github.get_pr_files
Pr Files
get_pr_files_response = carbon.github.get_pr_files(
data_source_id=1,
repository="string_example",
pull_number=1,
include_remote_data=False,
page=1,
page_size=30,
next_cursor="string_example",
)
int
str
Full name of the repository, denoted as {owner}/{repo}
int
bool
int
int
Optional[str]
/integrations/data/github/pull_requests/files
post
carbon.github.get_pull_requests
Get Prs
get_pull_requests_response = carbon.github.get_pull_requests(
data_source_id=1,
repository="string_example",
include_remote_data=False,
page=1,
page_size=30,
next_cursor="string_example",
filters={
"state": "closed",
},
order_by="created",
order_dir="asc",
)
int
str
Full name of the repository, denoted as {owner}/{repo}
bool
int
int
Optional[str]
PullRequestFilters
PROrderBy
OrderDirV2Nullable
/integrations/data/github/pull_requests
post
carbon.integrations.cancel
Cancel Data Source Items Sync
cancel_response = carbon.integrations.cancel(
data_source_id=1,
)
int
/integrations/items/sync/cancel
post
carbon.integrations.connect_data_source
Connect Data Source
connect_data_source_response = carbon.integrations.connect_data_source(
authentication={
"source": "GOOGLE_DRIVE",
"access_token": "access_token_example",
},
sync_options={
"chunk_size": 1500,
"chunk_overlap": 20,
"skip_embedding_generation": False,
"embedding_model": "OPENAI",
"generate_sparse_vectors": False,
"prepend_filename_to_chunks": False,
"sync_files_on_connection": True,
"set_page_as_boundary": False,
"enable_file_picker": True,
"sync_source_items": True,
"incremental_sync": False,
},
)
Union[OAuthAuthentication, NotionAuthentication, OneDriveAuthentication, SharepointAuthentication, ConfluenceAuthentication, ZendeskAuthentication, ZoteroAuthentication, GitbookAuthetication, SalesforceAuthentication, FreskdeskAuthentication, S3Authentication, AzureBlobStorageAuthentication, GithubAuthentication, ServiceNowAuthentication, GuruAuthentication, GongAuthentication]
SyncOptions
/integrations/connect
post
carbon.integrations.connect_document360
You will need an access token to connect your Document360 account. To obtain an access token, follow the steps highlighted here https://apidocs.document360.com/apidocs/api-token.
connect_document360_response = carbon.integrations.connect_document360(
account_email="string_example",
access_token="string_example",
tags={},
chunk_size=1500,
chunk_overlap=20,
skip_embedding_generation=False,
embedding_model="OPENAI",
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
sync_files_on_connection=True,
request_id="string_example",
sync_source_items=True,
file_sync_config={
"auto_synced_source_types": ["ARTICLE"],
"sync_attachments": False,
"detect_audio_language": False,
"transcription_service": "assemblyai",
"include_speaker_labels": False,
"split_rows": False,
"generate_chunks_only": False,
"store_file_only": False,
"skip_file_processing": False,
"parsed_text_format": "PLAIN_TEXT",
},
data_source_tags={},
)
str
This email will be used to identify your Carbon data source. It should have access to the Document360 account you wish to connect.
str
Optional[Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]]
Optional[int]
Optional[int]
Optional[bool]
EmbeddingGenerators
Optional[bool]
Optional[bool]
Optional[bool]
Optional[str]
bool
Enabling this flag will fetch all available content from the source to be listed via list items endpoint
FileSyncConfigNullable
Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]
Tags to be associated with the data source. If the data source already has tags set, then an upsert will be performed.
/integrations/document360
post
carbon.integrations.connect_freshdesk
Refer to this article to obtain an API key: https://support.freshdesk.com/en/support/solutions/articles/215517. Make sure that your API key has permission to read solutions from your account and that you are on a paid plan. Once you have an API key, you can make a request to this endpoint along with your Freshdesk domain. This will trigger an automatic sync of the articles in your "solutions" tab. Additional parameters below can be used to associate data with the synced articles or modify the sync behavior.
connect_freshdesk_response = carbon.integrations.connect_freshdesk(
domain="string_example",
api_key="string_example",
tags={},
chunk_size=1500,
chunk_overlap=20,
skip_embedding_generation=False,
embedding_model="OPENAI",
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
sync_files_on_connection=True,
request_id="string_example",
sync_source_items=True,
file_sync_config={
"auto_synced_source_types": ["ARTICLE"],
"sync_attachments": False,
"detect_audio_language": False,
"transcription_service": "assemblyai",
"include_speaker_labels": False,
"split_rows": False,
"generate_chunks_only": False,
"store_file_only": False,
"skip_file_processing": False,
"parsed_text_format": "PLAIN_TEXT",
},
data_source_tags={},
)
str
str
Optional[Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]]
Optional[int]
Optional[int]
Optional[bool]
EmbeddingGeneratorsNullable
Optional[bool]
Optional[bool]
Optional[bool]
Optional[str]
bool
Enabling this flag will fetch all available content from the source to be listed via list items endpoint
FileSyncConfigNullable
Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]
Tags to be associated with the data source. If the data source already has tags set, then an upsert will be performed.
/integrations/freshdesk
post
carbon.integrations.connect_gitbook
You will need an access token to connect your Gitbook account. Note that the permissions will be defined by the user generating the access token, so make sure you have permission to access the spaces you will be syncing. Refer to this article for more details: https://developer.gitbook.com/gitbook-api/authentication. Additionally, you need to specify the name of the organization you will be syncing data from.
connect_gitbook_response = carbon.integrations.connect_gitbook(
organization="string_example",
access_token="string_example",
tags={},
chunk_size=1500,
chunk_overlap=20,
skip_embedding_generation=False,
embedding_model="OPENAI",
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
sync_files_on_connection=True,
request_id="string_example",
sync_source_items=True,
file_sync_config={
"auto_synced_source_types": ["ARTICLE"],
"sync_attachments": False,
"detect_audio_language": False,
"transcription_service": "assemblyai",
"include_speaker_labels": False,
"split_rows": False,
"generate_chunks_only": False,
"store_file_only": False,
"skip_file_processing": False,
"parsed_text_format": "PLAIN_TEXT",
},
data_source_tags={},
)
str
str
Optional[Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]]
Optional[int]
Optional[int]
Optional[bool]
EmbeddingGenerators
Optional[bool]
Optional[bool]
Optional[bool]
Optional[str]
bool
Enabling this flag will fetch all available content from the source to be listed via list items endpoint
FileSyncConfigNullable
Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]
Tags to be associated with the data source. If the data source already has tags set, then an upsert will be performed.
/integrations/gitbook
post
carbon.integrations.connect_guru
You will need an access token to connect your Guru account. To obtain an access token, follow the steps highlighted here https://help.getguru.com/docs/gurus-api#obtaining-a-user-token. The username should be your Guru username.
connect_guru_response = carbon.integrations.connect_guru(
username="string_example",
access_token="string_example",
tags={},
chunk_size=1500,
chunk_overlap=20,
skip_embedding_generation=False,
embedding_model="OPENAI",
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
sync_files_on_connection=True,
request_id="string_example",
sync_source_items=True,
file_sync_config={
"auto_synced_source_types": ["ARTICLE"],
"sync_attachments": False,
"detect_audio_language": False,
"transcription_service": "assemblyai",
"include_speaker_labels": False,
"split_rows": False,
"generate_chunks_only": False,
"store_file_only": False,
"skip_file_processing": False,
"parsed_text_format": "PLAIN_TEXT",
},
data_source_tags={},
)
str
str
Optional[Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]]
Optional[int]
Optional[int]
Optional[bool]
EmbeddingGenerators
Optional[bool]
Optional[bool]
Optional[bool]
Optional[str]
bool
Enabling this flag will fetch all available content from the source to be listed via list items endpoint
FileSyncConfigNullable
Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]
Tags to be associated with the data source. If the data source already has tags set, then an upsert will be performed.
/integrations/guru
post
carbon.integrations.create_aws_iam_user
This endpoint can be used to connect S3 as well as Digital Ocean Spaces (S3 compatible).
For S3, create a new IAM user with permissions to:
create_aws_iam_user_response = carbon.integrations.create_aws_iam_user(
access_key="string_example",
access_key_secret="string_example",
sync_source_items=True,
endpoint_url="string_example",
data_source_tags={},
)
str
str
bool
Enabling this flag will fetch all available content from the source to be listed via list items endpoint
Optional[str]
You can specify a Digital Ocean endpoint URL to connect a Digital Ocean Space through this endpoint. The URL should be of the format <region>.digitaloceanspaces.com. It's not required for S3 buckets.
Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]
Tags to be associated with the data source. If the data source already has tags set, then an upsert will be performed.
/integrations/s3
post
carbon.integrations.get_oauth_url
This endpoint can be used to generate the following URLs
get_oauth_url_response = carbon.integrations.get_oauth_url(
service="BOX",
tags=None,
scope="string_example",
scopes=[],
chunk_size=1500,
chunk_overlap=20,
skip_embedding_generation=False,
embedding_model="OPENAI",
zendesk_subdomain="string_example",
microsoft_tenant="string_example",
sharepoint_site_name="string_example",
confluence_subdomain="string_example",
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
max_items_per_chunk=1,
salesforce_domain="string_example",
sync_files_on_connection=True,
set_page_as_boundary=False,
data_source_id=1,
connecting_new_account=False,
request_id="string_example",
use_ocr=False,
parse_pdf_tables_with_ocr=False,
enable_file_picker=True,
sync_source_items=True,
incremental_sync=False,
file_sync_config={
"auto_synced_source_types": ["ARTICLE"],
"sync_attachments": False,
"detect_audio_language": False,
"transcription_service": "assemblyai",
"include_speaker_labels": False,
"split_rows": False,
"generate_chunks_only": False,
"store_file_only": False,
"skip_file_processing": False,
"parsed_text_format": "PLAIN_TEXT",
},
automatically_open_file_picker=True,
gong_account_email="string_example",
servicenow_credentials={
"instance_subdomain": "instance_subdomain_example",
"client_id": "client_id_example",
"client_secret": "client_secret_example",
"redirect_uri": "redirect_uri_example",
},
data_source_tags={},
)
OauthBasedConnectors
Union[bool, date, datetime, dict, float, int, list, str, None]
Optional[str]
OAuthURLRequestScopes
Optional[int]
Optional[int]
Optional[bool]
EmbeddingGeneratorsNullable
Optional[str]
Optional[str]
Optional[str]
Optional[str]
Optional[bool]
Optional[bool]
Optional[int]
Number of objects per chunk. For csv, tsv, xlsx, and json files only.
Optional[str]
Optional[bool]
Used to specify whether Carbon should attempt to sync all your files automatically when authorization is complete. This is only supported for a subset of connectors and will be ignored for the rest. Supported connectors: Intercom, Zendesk, Gitbook, Confluence, Salesforce, Freshdesk
bool
Optional[int]
Used to specify a data source to sync from if you have multiple connected. It can be skipped if you only have one data source of that type connected or are connecting a new account.
Optional[bool]
Used to connect a new data source. If not specified, we will attempt to create a sync URL for an existing data source based on type and ID.
Optional[str]
This request id will be added to all files that get synced using the generated OAuth URL
Optional[bool]
Enable OCR for files that support it. Supported formats: pdf, png, jpg
Optional[bool]
bool
Enable integration's file picker for sources that support it. Supported sources: BOX, DROPBOX, GOOGLE_DRIVE, ONEDRIVE, SHAREPOINT
bool
Enabling this flag will fetch all available content from the source to be listed via list items endpoint
bool
Only sync files if they have not already been synced or if the embedding properties have changed. This flag is currently supported by ONEDRIVE, GOOGLE_DRIVE, BOX, DROPBOX, INTERCOM, GMAIL, OUTLOOK, ZENDESK, CONFLUENCE, NOTION, SHAREPOINT, SERVICENOW. It will be ignored for other data sources.
FileSyncConfigNullable
Optional[bool]
Automatically open source file picker after the OAuth flow is complete. This flag is currently supported by BOX, DROPBOX, GOOGLE_DRIVE, ONEDRIVE, SHAREPOINT. It will be ignored for other data sources.
Optional[str]
If you are connecting a Gong account, you need to input the email of the account you wish to connect. This email will be used to identify your Carbon data source.
ServiceNowCredentialsNullable
Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]
Tags to be associated with the data source. If the data source already has tags set, then an upsert will be performed.
/integrations/oauth_url
post
carbon.integrations.list_confluence_pages
This endpoint has been deprecated. Use /integrations/items/list instead.
To begin listing a user's Confluence pages, at least the data_source_id of a connected Confluence account must be specified. This base request returns a list of root pages for every space the user has access to in a Confluence instance. To traverse further down the user's page directory, additional requests to this endpoint can be made with the same data_source_id and with parent_id set to the id of a page from a previous request. For convenience, the has_children property in each directory item in the response list will flag which pages will return non-empty lists of pages when set as the parent_id.
list_confluence_pages_response = carbon.integrations.list_confluence_pages(
data_source_id=1,
parent_id="string_example",
)
int
Optional[str]
/integrations/confluence/list
post
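A hedged sketch of recursively walking the page tree with this endpoint; the response field names (items, id, has_children) are assumptions based on the description above, so adjust them to the actual response model:

# Hedged sketch: recursively walk a Confluence page tree via parent_id.
def walk_confluence_pages(data_source_id, parent_id=None, depth=0):
    response = carbon.integrations.list_confluence_pages(
        data_source_id=data_source_id,
        parent_id=parent_id,
    )
    for page in response.items:  # assumed field name
        print("  " * depth + str(page.id))
        if page.has_children:
            walk_confluence_pages(data_source_id, page.id, depth + 1)

walk_confluence_pages(data_source_id=1)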
carbon.integrations.list_conversations
List all of your public and private channels, DMs, and Group DMs. The ID from the response can be used as a filter to sync messages to Carbon.
types: comma-separated list of types. Available types are im (DMs), mpim (group DMs), public_channel, and private_channel. Defaults to public_channel.
cursor: used for pagination. If next_cursor is returned in the response, you need to pass it as the cursor in the next request.
data_source_id: a data source needs to be specified if you have linked multiple Slack accounts.
exclude_archived: whether archived conversations should be excluded; defaults to true.
list_conversations_response = carbon.integrations.list_conversations(
types="public_channel",
cursor="string_example",
data_source_id=1,
exclude_archived=True,
)
str
Optional[str]
Optional[int]
bool
/integrations/slack/conversations
get
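A hedged sketch of cursor-based pagination over conversations, per the cursor/next_cursor description above; the response field names are assumptions:

# Hedged sketch: page through Slack conversations using next_cursor.
cursor = None
while True:
    response = carbon.integrations.list_conversations(
        types="public_channel,private_channel",
        cursor=cursor,
        exclude_archived=True,
    )
    for conversation in response.conversations:  # assumed field name
        print(conversation["id"])
    cursor = getattr(response, "next_cursor", None)  # assumed field name
    if not cursor:
        break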
carbon.integrations.list_data_source_items
List Data Source Items
list_data_source_items_response = carbon.integrations.list_data_source_items(
data_source_id=1,
parent_id="string_example",
filters={},
pagination={
"limit": 10,
"offset": 0,
"starting_id": 0,
},
order_by="name",
order_dir="asc",
)
int
Optional[str]
ListItemsFiltersNullable
Pagination
ExternalSourceItemsOrderBy
OrderDirV2
/integrations/items/list
post
carbon.integrations.list_folders
After connecting your Outlook account, you can use this endpoint to list all of your folders on Outlook. This includes both system folders like "inbox" and user-created folders.
list_folders_response = carbon.integrations.list_folders(
data_source_id=1,
)
Optional[int]
/integrations/outlook/user_folders
get
carbon.integrations.list_gitbook_spaces
After connecting your Gitbook account, you can use this endpoint to list all of your spaces under current organization.
list_gitbook_spaces_response = carbon.integrations.list_gitbook_spaces(
data_source_id=1,
)
int
/integrations/gitbook/spaces
get
carbon.integrations.list_labels
After connecting your Gmail account, you can use this endpoint to list all of your labels. User-created labels will have the type "user" and Gmail's default labels will have the type "system".
list_labels_response = carbon.integrations.list_labels(
data_source_id=1,
)
Optional[int]
/integrations/gmail/user_labels
get
carbon.integrations.list_outlook_categories
After connecting your Outlook account, you can use this endpoint to list all of your categories on Outlook. We currently support listing up to 250 categories.
list_outlook_categories_response = carbon.integrations.list_outlook_categories(
data_source_id=1,
)
Optional[int]
/integrations/outlook/user_categories
get
carbon.integrations.list_repos
Once you have connected your GitHub account, you can use this endpoint to list the repositories your account has access to. You can use a data source ID or username to fetch from a specific account.
list_repos_response = carbon.integrations.list_repos(
per_page=30,
page=1,
data_source_id=1,
)
int
int
Optional[int]
/integrations/github/repos
get
carbon.integrations.list_sharepoint_sites
List all Sharepoint sites in the connected tenant. The site names from the response can be used as the site name when connecting a Sharepoint site. If site name is null in the response, then site name should be left null when connecting to the site.
This endpoint requires an additional Sharepoint scope: "Sites.Read.All". Include this scope along with the default Sharepoint scopes to list Sharepoint sites, connect to a site, and finally sync files from the site. The default Sharepoint scopes are: [openid, offline_access, User.Read, Files.Read.All].
data_source_id: a data source needs to be specified if you have linked multiple Sharepoint accounts.
cursor: used for pagination. If next_cursor is returned in the response, you need to pass it as the cursor in the next request.
list_sharepoint_sites_response = carbon.integrations.list_sharepoint_sites(
data_source_id=1,
cursor="string_example",
)
Optional[int]
Optional[str]
/integrations/sharepoint/sites/list
get
carbon.integrations.sync_azure_blob_files
After optionally loading the items via /integrations/items/sync and /integrations/items/list, use the container name and file name as the ID in this endpoint to sync them into Carbon. Additional parameters below can associate data with the selected items or modify the sync behavior.
sync_azure_blob_files_response = carbon.integrations.sync_azure_blob_files(
ids=[{}],
tags={},
chunk_size=1500,
chunk_overlap=20,
skip_embedding_generation=False,
embedding_model="OPENAI",
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
max_items_per_chunk=1,
set_page_as_boundary=False,
data_source_id=1,
request_id="string_example",
use_ocr=False,
parse_pdf_tables_with_ocr=False,
file_sync_config={
"auto_synced_source_types": ["ARTICLE"],
"sync_attachments": False,
"detect_audio_language": False,
"transcription_service": "assemblyai",
"include_speaker_labels": False,
"split_rows": False,
"generate_chunks_only": False,
"store_file_only": False,
"skip_file_processing": False,
"parsed_text_format": "PLAIN_TEXT",
},
)
List[AzureBlobGetFileInput]
Optional[Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]]
Optional[int]
Optional[int]
Optional[bool]
EmbeddingGenerators
Optional[bool]
Optional[bool]
Optional[int]
Number of objects per chunk. For csv, tsv, xlsx, and json files only.
bool
Optional[int]
Optional[str]
Optional[bool]
Optional[bool]
FileSyncConfigNullable
/integrations/azure_blob_storage/files
post
carbon.integrations.sync_azure_blob_storage
This endpoint can be used to connect Azure Blob Storage.
For Azure Blob Storage, follow these steps:
Once created, provide us with the following details to generate the connection URL:
sync_azure_blob_storage_response = carbon.integrations.sync_azure_blob_storage(
account_name="string_example",
account_key="string_example",
sync_source_items=True,
data_source_tags={},
)
str
str
bool
Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]
Tags to be associated with the data source. If the data source already has tags set, then an upsert will be performed.
/integrations/azure_blob_storage
post
carbon.integrations.sync_confluence
This endpoint has been deprecated. Use /integrations/files/sync instead.
After listing pages in a user's Confluence account, the set of selected page ids and the connected account's data_source_id can be passed into this endpoint to sync them into Carbon. Additional parameters listed below can be used to associate data with the selected pages or alter the behavior of the sync.
sync_confluence_response = carbon.integrations.sync_confluence(
data_source_id=1,
ids=["string_example"],
tags={},
chunk_size=1500,
chunk_overlap=20,
skip_embedding_generation=False,
embedding_model="OPENAI",
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
max_items_per_chunk=1,
set_page_as_boundary=False,
request_id="string_example",
use_ocr=False,
parse_pdf_tables_with_ocr=False,
incremental_sync=False,
file_sync_config={
"auto_synced_source_types": ["ARTICLE"],
"sync_attachments": False,
"detect_audio_language": False,
"transcription_service": "assemblyai",
"include_speaker_labels": False,
"split_rows": False,
"generate_chunks_only": False,
"store_file_only": False,
"skip_file_processing": False,
"parsed_text_format": "PLAIN_TEXT",
},
)
int
Union[List[str], List[SyncFilesIds]]
Optional[Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]]
Optional[int]
Optional[int]
Optional[bool]
EmbeddingGeneratorsNullable
Optional[bool]
Optional[bool]
Optional[int]
Number of objects per chunk. For csv, tsv, xlsx, and json files only.
bool
Optional[str]
Optional[bool]
Optional[bool]
bool
Only sync files if they have not already been synced or if the embedding properties have changed. This flag is currently supported by ONEDRIVE, GOOGLE_DRIVE, BOX, DROPBOX, INTERCOM, GMAIL, OUTLOOK, ZENDESK, CONFLUENCE, NOTION, SHAREPOINT, SERVICENOW. It will be ignored for other data sources.
FileSyncConfigNullable
/integrations/confluence/sync
post
carbon.integrations.sync_data_source_items
Sync Data Source Items
sync_data_source_items_response = carbon.integrations.sync_data_source_items(
data_source_id=1,
)
int
/integrations/items/sync
post
carbon.integrations.sync_files
After listing files and folders via /integrations/items/sync and /integrations/items/list, use the selected items' external ids as the ids in this endpoint to sync them into Carbon. Sharepoint items take an additional parameter root_id, which identifies the drive the file or folder is in and is stored in root_external_id. That additional parameter is optional and excluding it will tell the sync to assume the item is stored in the default Documents drive.
sync_files_response = carbon.integrations.sync_files(
data_source_id=1,
ids=["string_example"],
tags={},
chunk_size=1500,
chunk_overlap=20,
skip_embedding_generation=False,
embedding_model="OPENAI",
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
max_items_per_chunk=1,
set_page_as_boundary=False,
request_id="string_example",
use_ocr=False,
parse_pdf_tables_with_ocr=False,
incremental_sync=False,
file_sync_config={
"auto_synced_source_types": ["ARTICLE"],
"sync_attachments": False,
"detect_audio_language": False,
"transcription_service": "assemblyai",
"include_speaker_labels": False,
"split_rows": False,
"generate_chunks_only": False,
"store_file_only": False,
"skip_file_processing": False,
"parsed_text_format": "PLAIN_TEXT",
},
)
int
Union[List[str], List[SyncFilesIds]]
Optional[Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]]
Optional[int]
Optional[int]
Optional[bool]
EmbeddingGeneratorsNullable
Optional[bool]
Optional[bool]
Optional[int]
Number of objects per chunk. For csv, tsv, xlsx, and json files only.
bool
Optional[str]
Optional[bool]
Optional[bool]
bool
Only sync files if they have not already been synced or if the embedding properties have changed. This flag is currently supported by ONEDRIVE, GOOGLE_DRIVE, BOX, DROPBOX, INTERCOM, GMAIL, OUTLOOK, ZENDESK, CONFLUENCE, NOTION, SHAREPOINT, SERVICENOW. It will be ignored for other data sources.
FileSyncConfigNullable
/integrations/files/sync
post
carbon.integrations.sync_git_hub
Refer to this article to obtain an access token: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens. Make sure that your access token has permission to read content from your desired repos. Note that if your access token expires, you will need to manually update it through this endpoint.
sync_git_hub_response = carbon.integrations.sync_git_hub(
username="string_example",
access_token="string_example",
sync_source_items=False,
data_source_tags={},
)
str
str
bool
Enabling this flag will fetch all available content from the source to be listed via the list items endpoint
Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]
Tags to be associated with the data source. If the data source already has tags set, then an upsert will be performed.
/integrations/github
post
carbon.integrations.sync_gitbook
You can sync up to 20 Gitbook spaces at a time using this endpoint. Additional parameters below can be used to associate data with the synced pages or modify the sync behavior.
sync_gitbook_response = carbon.integrations.sync_gitbook(
space_ids=["string_example"],
data_source_id=1,
tags={},
chunk_size=1500,
chunk_overlap=20,
skip_embedding_generation=False,
embedding_model="OPENAI",
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
request_id="string_example",
file_sync_config={
"auto_synced_source_types": ["ARTICLE"],
"sync_attachments": False,
"detect_audio_language": False,
"transcription_service": "assemblyai",
"include_speaker_labels": False,
"split_rows": False,
"generate_chunks_only": False,
"store_file_only": False,
"skip_file_processing": False,
"parsed_text_format": "PLAIN_TEXT",
},
)
GitbookSyncRequestSpaceIds
int
Optional[Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]]
Optional[int]
Optional[int]
Optional[bool]
EmbeddingGenerators
Optional[bool]
Optional[bool]
Optional[str]
FileSyncConfigNullable
/integrations/gitbook/sync
post
carbon.integrations.sync_gmail
Once you have successfully connected your Gmail account, you can choose which emails to sync with us using the filters parameter. Filters is a JSON object of key-value pairs; it also supports AND and OR operations. For now, we support the limited set of keys listed below.
label: Inbuilt Gmail labels, for example "Important" or a custom label you created.
after or before: A date in YYYY/mm/dd format (example 2023/12/31). Gets emails after/before a certain date.
You can also use them in combination to get emails from a certain period.
is: Can have the following values - starred, important, snoozed, and unread
from: Email address of the sender
to: Email address of the recipient
in: Can have the following values - sent (sync emails sent by the user)
has: Can have the following values - attachment (sync emails that have attachments)
Using keys or values outside of those specified above can lead to unexpected behavior.
An example of a basic query with filters:
{
"filters": {
"key": "label",
"value": "Test"
}
}
This will list all emails that have the label "Test".
You can use AND and OR operations in the following way:
{
"filters": {
"AND": [
{
"key": "after",
"value": "2024/01/07"
},
{
"OR": [
{
"key": "label",
"value": "Personal"
},
{
"key": "is",
"value": "starred"
}
]
}
]
}
}
This will return emails after January 7th, 2024 that are either starred or have the label "Personal". Note that this is the deepest level of nesting we support, i.e. you can't add more AND/OR filters within the OR filter in the above example.
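As a concrete sketch, the nested query above maps directly onto the SDK's filters argument (other parameters omitted for brevity):
# Emails after 2024/01/07 that are starred or labeled "Personal".
sync_gmail_response = carbon.integrations.sync_gmail(
    filters={
        "AND": [
            {"key": "after", "value": "2024/01/07"},
            {
                "OR": [
                    {"key": "label", "value": "Personal"},
                    {"key": "is", "value": "starred"},
                ]
            },
        ]
    },
)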
sync_gmail_response = carbon.integrations.sync_gmail(
filters={},
tags={},
chunk_size=1500,
chunk_overlap=20,
skip_embedding_generation=False,
embedding_model="OPENAI",
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
data_source_id=1,
request_id="string_example",
sync_attachments=False,
file_sync_config={
"auto_synced_source_types": ["ARTICLE"],
"sync_attachments": False,
"detect_audio_language": False,
"transcription_service": "assemblyai",
"include_speaker_labels": False,
"split_rows": False,
"generate_chunks_only": False,
"store_file_only": False,
"skip_file_processing": False,
"parsed_text_format": "PLAIN_TEXT",
},
incremental_sync=False,
)
Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]
Optional[Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]]
Optional[int]
Optional[int]
Optional[bool]
EmbeddingGenerators
Optional[bool]
Optional[bool]
Optional[int]
Optional[str]
Optional[bool]
FileSyncConfigNullable
bool
/integrations/gmail/sync
post
carbon.integrations.sync_outlook
Once you have successfully connected your Outlook account, you can choose which emails to sync with us using the filters and folder parameters. "folder" should be the Outlook folder you want to sync from; by default we get messages from your inbox folder. Filters is a JSON object of key-value pairs; it also supports AND and OR operations. For now, we support the limited set of keys listed below.
category: Custom categories that you created in Outlook.
after or before: A date in YYYY/mm/dd format (example 2023/12/31). Gets emails after/before a certain date. You can also use them in combination to get emails from a certain period.
is: Can have the following values: flagged
from: Email address of the sender
An example of a basic query with filters:
{
"filters": {
"key": "category",
"value": "Test"
}
}
This will list all emails that have the category "Test".
To specify a custom folder in the same query:
{
"folder": "Folder Name",
"filters": {
"key": "category",
"value": "Test"
}
}
You can use AND and OR operations in the following way:
{
"filters": {
"AND": [
{
"key": "after",
"value": "2024/01/07"
},
{
"OR": [
{
"key": "category",
"value": "Personal"
},
{
"key": "category",
"value": "Test"
}
]
}
]
}
}
This will return emails after January 7th, 2024 that have either Personal or Test as their category. Note that this is the deepest level of nesting we support, i.e. you can't add more AND/OR filters within the OR filter in the above example.
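As a concrete sketch, the folder-plus-filters query above expressed through the SDK (other parameters omitted for brevity):
# Emails in a custom folder, after 2024/01/07, categorized Personal or Test.
sync_outlook_response = carbon.integrations.sync_outlook(
    folder="Folder Name",
    filters={
        "AND": [
            {"key": "after", "value": "2024/01/07"},
            {
                "OR": [
                    {"key": "category", "value": "Personal"},
                    {"key": "category", "value": "Test"},
                ]
            },
        ]
    },
)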
sync_outlook_response = carbon.integrations.sync_outlook(
filters={},
tags={},
folder="Inbox",
chunk_size=1500,
chunk_overlap=20,
skip_embedding_generation=False,
embedding_model="OPENAI",
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
data_source_id=1,
request_id="string_example",
sync_attachments=False,
file_sync_config={
"auto_synced_source_types": ["ARTICLE"],
"sync_attachments": False,
"detect_audio_language": False,
"transcription_service": "assemblyai",
"include_speaker_labels": False,
"split_rows": False,
"generate_chunks_only": False,
"store_file_only": False,
"skip_file_processing": False,
"parsed_text_format": "PLAIN_TEXT",
},
incremental_sync=False,
)
Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]
Optional[Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]]
Optional[str]
Optional[int]
Optional[int]
Optional[bool]
EmbeddingGenerators
Optional[bool]
Optional[bool]
Optional[int]
Optional[str]
Optional[bool]
FileSyncConfigNullable
bool
/integrations/outlook/sync
post
carbon.integrations.sync_repos
You can retrieve the repos your token has access to using /integrations/github/repos and sync their content. You can also pass the full name of any public repository (username/repo-name). This will store the repo content with Carbon, which can be accessed through the /integrations/items/list endpoint. A maximum of 25 repositories is accepted per request.
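For instance, syncing a public repository by its full name (the repository below is just an illustrative public repo):
sync_repos_response = carbon.integrations.sync_repos(
    repos=["octocat/Hello-World"],  # username/repo-name of a public repository
    data_source_id=1,
)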
sync_repos_response = carbon.integrations.sync_repos(
repos=["string_example"],
data_source_id=1,
)
GithubFetchReposRequestRepos
Optional[int]
/integrations/github/sync_repos
post
carbon.integrations.sync_rss_feed
Rss Feed
sync_rss_feed_response = carbon.integrations.sync_rss_feed(
url="string_example",
tags={},
chunk_size=1500,
chunk_overlap=20,
skip_embedding_generation=False,
embedding_model="OPENAI",
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
request_id="string_example",
data_source_tags={},
)
str
Optional[Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]]
Optional[int]
Optional[int]
Optional[bool]
EmbeddingGenerators
Optional[bool]
Optional[bool]
Optional[str]
Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]
Tags to be associated with the data source. If the data source already has tags set, then an upsert will be performed.
/integrations/rss_feed
post
carbon.integrations.sync_s3_files
After optionally loading the items via /integrations/items/sync and /integrations/items/list, use the bucket name and object key as the ID in this endpoint to sync them into Carbon. Additional parameters below can associate data with the selected items or modify the sync behavior.
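As a sketch, the three accepted id shapes might look like this; the "bucket" and "id" field names are assumptions inferred from the S3GetFileInput description below, not a confirmed schema:
carbon.integrations.sync_s3_files(
    ids=[
        {"bucket": "my-bucket"},                               # a whole bucket
        {"bucket": "my-bucket", "id": "reports/2024/"},        # bucket + prefix (ends with /)
        {"bucket": "my-bucket", "id": "reports/2024/q1.pdf"},  # bucket + object key
    ],
)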
sync_s3_files_response = carbon.integrations.sync_s3_files(
ids=[{}],
tags={},
chunk_size=1500,
chunk_overlap=20,
skip_embedding_generation=False,
embedding_model="OPENAI",
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
max_items_per_chunk=1,
set_page_as_boundary=False,
data_source_id=1,
request_id="string_example",
use_ocr=False,
parse_pdf_tables_with_ocr=False,
file_sync_config={
"auto_synced_source_types": ["ARTICLE"],
"sync_attachments": False,
"detect_audio_language": False,
"transcription_service": "assemblyai",
"include_speaker_labels": False,
"split_rows": False,
"generate_chunks_only": False,
"store_file_only": False,
"skip_file_processing": False,
"parsed_text_format": "PLAIN_TEXT",
},
)
List[S3GetFileInput]
Each input should be one of the following: a bucket name, a bucket name and a prefix, or a bucket name and an object key. A prefix is the common path for all objects you want to sync; paths should end with a forward slash.
Optional[Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]]
Optional[int]
Optional[int]
Optional[bool]
EmbeddingGenerators
Optional[bool]
Optional[bool]
Optional[int]
Number of objects per chunk. For csv, tsv, xlsx, and json files only.
bool
Optional[int]
Optional[str]
Optional[bool]
Optional[bool]
FileSyncConfigNullable
/integrations/s3/files
post
carbon.integrations.sync_slack
You can list all conversations using the endpoint /integrations/slack/conversations. The ID of a conversation is used as the input for this endpoint, with timestamps as optional filters.
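As a sketch of that flow (how the conversation ID is read off the list response is an assumption; inspect the actual response shape first):
# List conversations, then sync one by its ID.
conversations = carbon.integrations.list_conversations()
conversation_id = "C0123456789"  # hypothetical ID taken from the listing above
sync_slack_response = carbon.integrations.sync_slack(
    filters={"conversation_id": conversation_id},
)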
sync_slack_response = carbon.integrations.sync_slack(
filters={
"conversation_id": "conversation_id_example",
},
tags={},
chunk_size=1500,
chunk_overlap=20,
skip_embedding_generation=False,
embedding_model="OPENAI",
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
data_source_id=1,
request_id="string_example",
)
SlackFilters
Optional[Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]]
Optional[int]
Optional[int]
Optional[bool]
EmbeddingGenerators
Optional[bool]
Optional[bool]
Optional[int]
Optional[str]
/integrations/slack/sync
post
carbon.organizations.get
Get Organization
get_response = carbon.organizations.get()
/organization
get
carbon.organizations.update
Update Organization
update_response = carbon.organizations.update(
global_user_config={},
data_source_configs={
"key": {
"allowed_file_formats": [],
},
},
)
UserConfigurationNullable
UpdateOrganizationInputDataSourceConfigs
/organization/update
post
carbon.organizations.update_stats
Use this endpoint to reaggregate the statistics for an organization, for example aggregate_file_size. The reaggregation process is asynchronous, so a webhook with the event type FILE_STATISTICS_AGGREGATED will be sent to notify you when the process is complete. After the aggregation is complete, the updated statistics can be retrieved using the /organization endpoint. The response of /organization will also contain a timestamp of the last time the statistics were reaggregated.
update_stats_response = carbon.organizations.update_stats()
/organization/statistics
post
carbon.users.delete
Delete Users
delete_response = carbon.users.delete(
customer_ids=["string_example"],
)
DeleteUsersInputCustomerIds
/delete_users
post
carbon.users.get
User Endpoint
get_response = carbon.users.get(
customer_id="string_example",
)
str
/user
post
carbon.users.list
List users within an organization
list_response = carbon.users.list(
pagination={
"limit": 10,
"offset": 0,
"starting_id": 0,
},
filters={},
order_by="created_at",
order_dir="asc",
include_count=False,
)
Pagination
ListUsersFilters
ListUsersOrderByTypes
OrderDirV2
bool
/list_users
post
carbon.users.toggle_user_features
Toggle User Features
toggle_user_features_response = carbon.users.toggle_user_features(
configuration_key_name="sparse_vectors",
value={},
)
ConfigurationKeys
Dict[str, Union[bool, date, datetime, dict, float, int, list, str, None]]
/modify_user_configuration
post
carbon.users.update_users
Update Users
update_users_response = carbon.users.update_users(
customer_ids=["string_example"],
auto_sync_enabled_sources=["string_example"],
max_files=-1,
max_files_per_upload=-1,
max_characters=-1,
max_characters_per_file=-1,
max_characters_per_upload=-1,
auto_sync_interval=-1,
)
UpdateUsersInputCustomerIds
Union[List[DataSourceType], DataSourceExtendedInput]
List of data source types to enable auto sync for. An empty array will remove all sources, and the string "ALL" will enable it for all data sources.
Optional[int]
Custom file upload limit for the user over all of the user's files across all uploads. If set, then the user will not be allowed to upload more files than this limit. If not set, or if set to -1, then the user will have no limit.
Optional[int]
Custom file upload limit for the user across a single upload. If set, then the user will not be allowed to upload more files than this limit in a single upload. If not set, or if set to -1, then the user will have no limit.
Optional[int]
Custom character upload limit for the user over all of the user's files across all uploads. If set, then the user will not be allowed to upload more characters than this limit. If not set, or if set to -1, then the user will have no limit.
Optional[int]
A single file upload from the user can not exceed this character limit. If set, then the file will not be synced if it exceeds this limit. If not set, or if set to -1, then the user will have no limit.
Optional[int]
Custom character upload limit for the user across a single upload. If set, then the user won't be able to sync more than this many characters in one upload. If not set, or if set to -1, then the user will have no limit.
Optional[int]
The interval in hours at which the user's data sources should be synced. If not set or set to -1, the user will be synced at the organization level interval or default interval if that is also not set. Must be one of [3, 6, 12, 24]
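As a sketch of the -1 convention described above, mixing a concrete cap with unlimited values:
update_users_response = carbon.users.update_users(
    customer_ids=["customer_a"],  # hypothetical customer id
    max_files_per_upload=50,      # cap files per single upload
    max_files=-1,                 # -1 leaves the total file count unlimited
    auto_sync_interval=24,        # must be one of 3, 6, 12, 24
)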
/update_users
post
carbon.users.who_am_i
Me Endpoint
who_am_i_response = carbon.users.who_am_i()
/whoami
get
carbon.utilities.fetch_urls
Extracts all URLs from a webpage.
Args: url (str): URL of the webpage
Returns: FetchURLsResponse: A response object with a list of URLs extracted from the webpage and the webpage content.
fetch_urls_response = carbon.utilities.fetch_urls(
url="url_example",
)
str
/fetch_urls
get
carbon.utilities.fetch_webpage
Fetch Urls V2
fetch_webpage_response = carbon.utilities.fetch_webpage(
url="string_example",
)
str
/fetch_webpage
post
carbon.utilities.fetch_youtube_transcripts
Fetches English transcripts from YouTube videos.
Args: id (str): The ID of the YouTube video. raw (bool): Whether to return the raw transcript or not. Defaults to False.
Returns: dict: A dictionary with the transcript of the YouTube video.
fetch_youtube_transcripts_response = carbon.utilities.fetch_youtube_transcripts(
id="id_example",
raw=False,
)
str
bool
/fetch_youtube_transcript
get
carbon.utilities.process_sitemap
Retrieves all URLs from a sitemap, which can subsequently be utilized with our web_scrape endpoint.
process_sitemap_response = carbon.utilities.process_sitemap(
url="url_example",
)
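A sketch of that pipeline; the urls attribute on the response is an assumption, not a confirmed field name:
# Pull URLs from a sitemap, then hand them to the web scraper.
sitemap = carbon.utilities.process_sitemap(url="https://example.com/sitemap.xml")
carbon.utilities.scrape_web(body=[{"url": u} for u in sitemap.urls])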
str
/process_sitemap
get
carbon.utilities.scrape_sitemap
Extracts all URLs from a sitemap and performs a web scrape on each of them.
Args: sitemap_url (str): URL of the sitemap
Returns: dict: A response object with the status of the scraping job.
scrape_sitemap_response = carbon.utilities.scrape_sitemap(
url="string_example",
tags={
"key": "string_example",
},
max_pages_to_scrape=1,
chunk_size=1500,
chunk_overlap=20,
skip_embedding_generation=False,
enable_auto_sync=False,
generate_sparse_vectors=False,
prepend_filename_to_chunks=False,
html_tags_to_skip=[],
css_classes_to_skip=[],
css_selectors_to_skip=[],
embedding_model="OPENAI",
url_paths_to_include=[],
url_paths_to_exclude=[],
urls_to_scrape=[],
download_css_and_media=False,
generate_chunks_only=False,
store_file_only=False,
use_premium_proxies=False,
)
str
SitemapScrapeRequestTags
Optional[int]
Optional[int]
Optional[int]
Optional[bool]
Optional[bool]
Optional[bool]
Optional[bool]
SitemapScrapeRequestHtmlTagsToSkip
SitemapScrapeRequestCssClassesToSkip
SitemapScrapeRequestCssSelectorsToSkip
EmbeddingGenerators
SitemapScrapeRequestUrlPathsToInclude
SitemapScrapeRequestUrlPathsToExclude
SitemapScrapeRequestUrlsToScrape
Optional[bool]
Whether the scraper should download CSS and media from the page (images, fonts, etc.). Scrapes might take longer to finish with this flag enabled, but the success rate is improved.
bool
If this flag is enabled, the file will be chunked and stored with Carbon, but no embeddings will be generated. This overrides the skip_embedding_generation flag.
bool
If this flag is enabled, the file will be stored with Carbon, but no processing will be done.
bool
If the default proxies are blocked and not returning results, this flag can be enabled to use alternate proxies (residential and office). Scrapes might take longer to finish with this flag enabled.
/scrape_sitemap
post
carbon.utilities.scrape_web
Conduct a web scrape on a given webpage URL. Our web scraper is fully compatible with JavaScript and supports recursion depth, enabling you to efficiently extract all content from the target website.
scrape_web_response = carbon.utilities.scrape_web(
body=[
{
"url": "url_example",
"recursion_depth": 3,
"max_pages_to_scrape": 100,
"chunk_size": 1500,
"chunk_overlap": 20,
"skip_embedding_generation": False,
"enable_auto_sync": False,
"generate_sparse_vectors": False,
"prepend_filename_to_chunks": False,
"html_tags_to_skip": [],
"css_classes_to_skip": [],
"css_selectors_to_skip": [],
"embedding_model": "OPENAI",
"url_paths_to_include": [],
"download_css_and_media": False,
"generate_chunks_only": False,
"store_file_only": False,
"use_premium_proxies": False,
}
],
)
/web_scrape
post
carbon.utilities.search_urls
Perform a web search and obtain a list of relevant URLs.
As an illustration, when you perform a search for “content related to MRNA,” you will receive a list of links such as the following:
- https://tomrenz.substack.com/p/mrna-and-why-it-matters
- https://www.statnews.com/2020/11/10/the-story-of-mrna-how-a-once-dismissed-idea-became-a-leading-technology-in-the-covid-vaccine-race/
- https://www.statnews.com/2022/11/16/covid-19-vaccines-were-a-success-but-mrna-still-has-a-delivery-problem/
- https://joomi.substack.com/p/were-still-being-misled-about-how
Subsequently, you can submit these links to the web_scrape endpoint in order to retrieve the content of the respective web pages.
Args: query (str): Query to search for
Returns: FetchURLsResponse: A response object with a list of URLs for a given search query.
search_urls_response = carbon.utilities.search_urls(
query="query_example",
)
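As a follow-on sketch, feeding the search results into the web scraper; the urls attribute on the response is an assumption, not a confirmed field name:
results = carbon.utilities.search_urls(query="content related to MRNA")
carbon.utilities.scrape_web(body=[{"url": u} for u in results.urls])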
str
/search_urls
get
carbon.utilities.user_webpages
User Web Pages
user_webpages_response = carbon.utilities.user_webpages(
filters={},
pagination={
"limit": 10,
"offset": 0,
"starting_id": 0,
},
order_by="created_at",
order_dir="asc",
)
UserWebPagesFilters
Pagination
UserWebPageOrderByTypes
OrderDirV2
/user_webpages
post
carbon.webhooks.add_url
Add Webhook Url
add_url_response = carbon.webhooks.add_url(
url="string_example",
)
str
/add_webhook
post
carbon.webhooks.delete_url
Delete Webhook Url
delete_url_response = carbon.webhooks.delete_url(
webhook_id=1,
)
int
/delete_webhook/{webhook_id}
delete
carbon.webhooks.urls
Webhook Urls
urls_response = carbon.webhooks.urls(
pagination={
"limit": 10,
"offset": 0,
"starting_id": 0,
},
order_by="created_at",
order_dir="desc",
filters={
"ids": [],
},
)
Pagination
WebhookOrderByColumns
OrderDir
WebhookFilters
/webhooks
post
carbon.white_label.create
Create White Labels
create_response = carbon.white_label.create(
body=[
{
"data_source_type": "GOOGLE_DRIVE",
"credentials": {
"client_id": "client_id_example",
"redirect_uri": "redirect_uri_example",
},
}
],
)
/white_label/create
post
carbon.white_label.delete
Delete White Labels
delete_response = carbon.white_label.delete(
ids=[1],
)
DeleteWhiteLabelRequestIds
/white_label/delete
post
carbon.white_label.list
List White Labels
list_response = carbon.white_label.list(
pagination={
"limit": 10,
"offset": 0,
"starting_id": 0,
},
order_by="created_at",
order_dir="desc",
filters={
"ids": [],
"data_source_type": [],
},
)
Pagination
WhiteLabelOrderByColumns
OrderDir
WhiteLabelFilters
/white_label/list
post
carbon.white_label.update
Update White Label
update_response = carbon.white_label.update(
body={
"data_source_type": "GOOGLE_DRIVE",
"credentials": {
"client_id": "client_id_example",
"redirect_uri": "redirect_uri_example",
},
},
data_source_type="INTERCOM",
credentials={
"client_id": "client_id_example",
"redirect_uri": "redirect_uri_example",
},
)
str
Credentials
/white_label/update
post
This Python package is automatically generated by Konfig