
A powerful Python library for intelligent text processing, question generation, and answer generation for LLM fine-tuning datasets
TextFission is a powerful text-processing tool for splitting long texts into chunks and generating related questions and answers. It supports multiple languages, provides intelligent text-splitting strategies, and produces high-quality question-answer pairs.
pip install textfission
If you run into dependency conflicts, especially numpy version conflicts, try one of the following solutions:
pip install "numpy>=1.21.0,<2.0.0"
pip install textfission
python -m venv textfission-env
source textfission-env/bin/activate # Linux/Mac
# or
textfission-env\Scripts\activate # Windows
pip install textfission
conda create -n textfission python=3.11
conda activate textfission
pip install textfission
Dependency conflict errors: if you see an error like the following:
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed.
see the Installation Guide for detailed solutions.
from textfission import create_dataset, Config, ModelConfig

# Create the configuration
config = Config(
    model_settings=ModelConfig(
        api_key="your-api-key",
        model="gpt-3.5-turbo"
    )
)

# Process the text
text = "Your long text content..."
result = create_dataset(text, config, "output/dataset.json")
from textfission import create_dataset, Config, ModelConfig

# Create the DeepSeek configuration
config = Config(
    model_settings=ModelConfig(
        api_key="your-deepseek-api-key",
        model="deepseek-chat",
        api_base_url="https://api.deepseek.com/v1"  # DeepSeek API endpoint
    )
)

# Process the text
text = "Your long text content..."
result = create_dataset(text, config, "output/deepseek_dataset.json")
from textfission import create_dataset, Config, ModelConfig

# Create the Tongyi Qianwen configuration
config = Config(
    model_settings=ModelConfig(
        api_key="your-qianwen-api-key",
        model="qwen-turbo"
    )
)

# Process the text
text = "Your long text content..."
result = create_dataset(text, config, "output/qianwen_dataset.json")
config = {
    "model_settings": {
        "api_key": "your-api-key",
        "model": "gpt-3.5-turbo",
        "temperature": 0.7,
        "max_tokens": 2000,
        "api_base_url": None  # Optional: custom API endpoint
    },
    "processing_config": {
        "max_workers": 4,
        "batch_size": 10,
        "timeout": 30
    },
    "export_config": {
        "format": "json",
        "output_dir": "output"
    },
    "custom_config": {
        "language": "zh",
        "min_confidence": 0.7,
        "min_quality": "good"
    }
}
The api_base_url parameter supports custom API endpoints (see the DeepSeek example above). TextFission can also be configured through environment variables:

OPENAI_API_KEY=your-api-key
MODEL_NAME=gpt-3.5-turbo
LANGUAGE=zh
MAX_WORKERS=4
BATCH_SIZE=10
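These variables can be read with the standard library and passed into the configuration objects shown earlier. A minimal sketch, assuming the variable names above; the fallback default for MODEL_NAME is illustrative, not a library behavior:

import os

from textfission import Config, ModelConfig

# Build a configuration from the environment variables listed above
config = Config(
    model_settings=ModelConfig(
        api_key=os.getenv("OPENAI_API_KEY"),
        model=os.getenv("MODEL_NAME", "gpt-3.5-turbo")  # illustrative default
    )
)

# LANGUAGE, MAX_WORKERS, and BATCH_SIZE can be wired into the processing
# settings the same way, e.g. int(os.getenv("MAX_WORKERS", "4")).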
from textfission import ModelFactory, Config

# Infer the model type automatically
config = Config(...)
model = ModelFactory.create_model(config)

# Specify the model type manually
model = ModelFactory.create_model(config, model_type="openai")

# List the supported models
supported_models = ModelFactory.get_supported_models()
print(supported_models)
from textfission import ModelFactory
from textfission.models.base import BaseModel

class CustomModel(BaseModel):
    def generate(self, prompt: str) -> str:
        # Implement the generation logic
        pass

    def get_embedding(self, text: str) -> list:
        # Implement the embedding logic
        pass

    def count_tokens(self, text: str) -> int:
        # Implement the token-counting logic
        pass

# Register the model
ModelFactory.register_model("custom", CustomModel)
ModelFactory.register_model_name("my-model", "custom")
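For local testing, the three methods can be filled in with stand-ins. The sketch below is illustrative only: EchoModel, its canned response, the fixed-size zero embedding, and the whitespace token count are all assumptions, and it presumes BaseModel can be subclassed without extra constructor arguments:

from textfission import ModelFactory
from textfission.models.base import BaseModel

class EchoModel(BaseModel):
    """Toy model for local testing; not suitable for real dataset generation."""

    def generate(self, prompt: str) -> str:
        # Echo the prompt back; a real model would call an LLM API here
        return f"Echo: {prompt}"

    def get_embedding(self, text: str) -> list:
        # Fixed-size dummy embedding (assumption: any list of floats is accepted)
        return [0.0] * 768

    def count_tokens(self, text: str) -> int:
        # Naive whitespace token count; real models use their own tokenizer
        return len(text.split())

ModelFactory.register_model("echo", EchoModel)
ModelFactory.register_model_name("echo-test", "echo")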
from textfission.processors import SmartTextSplitter

splitter = SmartTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    language="zh"
)
chunks = splitter.split(text)
from textfission.processors import QuestionGenerator

generator = QuestionGenerator(
    max_questions_per_chunk=5,
    min_questions_per_chunk=2,
    question_types=["factual", "inferential"]
)
questions = generator.generate(chunk)
from textfission.processors import AnswerGenerator

generator = AnswerGenerator(
    min_confidence=0.7,
    min_quality="good"
)
answer = generator.generate(chunk, question)
from textfission.core import CacheManager

cache = CacheManager.get_instance()
cache.setup(
    max_size=1000,
    default_ttl=3600,
    cache_dir="cache"
)

# Use the cache
result = cache.get_or_set(
    key="unique_key",
    default_func=lambda: process_text(text)
)
from textfission.core import ErrorHandler, ErrorCodes

@ErrorHandler.retry_on_error(
    max_attempts=3,
    delay=1.0,
    error_codes=[ErrorCodes.API_ERROR]
)
def process_with_retry():
    # Your processing code
    pass
# Enable parallel processing
config = {
    "model_settings": {
        "use_parallel": True,
        "api_keys": ["key1", "key2"],
        "models": ["model1", "model2"]
    }
}

# Batch processing
results = tf.process_batch(texts, batch_size=10)
# Configure the cache
config = {
    "processing_config": {
        "cache_size": 1000,
        "cache_ttl": 3600
    }
}
{
  "chunks": [
    {
      "text": "Chunk text",
      "metadata": {
        "language": "zh",
        "length": 1000
      }
    }
  ],
  "questions": [
    {
      "text": "Question text",
      "type": "factual",
      "difficulty": 0.7,
      "keywords": ["keyword1", "keyword2"]
    }
  ],
  "answers": [
    {
      "text": "Answer text",
      "metadata": {
        "quality": "good",
        "confidence": 0.9,
        "citations": [
          {
            "text": "Cited text",
            "position": "position"
          }
        ]
      }
    }
  ]
}
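Because the export is plain JSON, the standard library is enough to consume it. A minimal sketch using the field names from the format above; note that pairing questions with answers by index is an assumption, since the schema shown does not spell out the linkage:

import json

# Load a dataset exported by create_dataset (path from the quick-start example)
with open("output/dataset.json", encoding="utf-8") as f:
    dataset = json.load(f)

for chunk in dataset["chunks"]:
    print(chunk["metadata"]["language"], chunk["metadata"]["length"])

# Assumption: questions and answers align by index
for question, answer in zip(dataset["questions"], dataset["answers"]):
    if answer["metadata"]["confidence"] >= 0.9:
        print(question["text"], "->", answer["text"])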
chunk_id,chunk_text,question_id,question_text,answer_text,quality,confidence
1,Chunk 1,1,Question 1,Answer 1,good,0.9
1,Chunk 1,2,Question 2,Answer 2,excellent,0.95
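The CSV export can be read the same way with csv.DictReader, keyed by the header row above; the file path here is an assumed example:

import csv

# Read the CSV export; column names match the header row shown above
with open("output/dataset.csv", encoding="utf-8", newline="") as f:
    for row in csv.DictReader(f):
        if float(row["confidence"]) >= 0.9:
            print(row["question_text"], "->", row["answer_text"])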
try:
    result = tf.process(text)
except TextFissionError as e:
    print(f"Error: {e.message}")
    print(f"Error code: {e.error_code}")
    print(f"Details: {e.details}")
import logging

from textfission.core import Logger

logger = Logger.get_instance()
logger.setup(
    name="textfission",
    level=logging.INFO,
    log_file="logs/textfission.log"
)

logger.info("Processing started", text_length=len(text))
logger.error("Processing failed", error=str(e))
MIT License