Python Transformer认知扭曲检测教程:心理健康NLP应用
概述
认知扭曲是CBT(认知行为疗法)的核心概念,指不理性、不准确的思想模式,常见于焦虑、抑郁等心理健康问题。
常见认知扭曲类型:
- 非黑即白思维:全有或全无
- 灾难化思维:预期最坏结果
- 过度概括:基于单一事件得出广泛结论
- 情绪化推理:相信感觉即事实
- 应该陈述:对他人/自己强加不切实际的期望
本教程将教你使用Transformer模型自动识别这些认知扭曲。
技术栈
code
# 核心依赖
import torch
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer
)
from datasets import Dataset
import pandas as pd
import numpy as np
from typing import List, Dict
Code collapsed
数据准备
认知扭曲数据集
code
# Example sentences for each cognitive-distortion category.
# Keys are the classification labels used as training targets; values are
# short Chinese sentences exhibiting that thinking pattern. The "neutral"
# key holds healthy-thinking counter-examples so the classifier can learn
# a no-distortion class.
COGNITIVE_DISTORTIONS_DATA = {
    "all_or_nothing": [
        "我是个彻底的失败者",
        "如果我不能做到完美,那我就是一无是处",
        "要么成功,要么什么都不是",
        "犯了一个错误,我整个人都毁了"
    ],
    "catastrophizing": [
        "如果这次面试失败,我的人生就完了",
        "那个头痛肯定是脑瘤",
        "如果我考试不及格,就永远找不到工作",
        "一件小事出错,一切都会崩溃"
    ],
    "overgeneralization": [
        "我总是做不好任何事情",
        "没有人会喜欢我",
        "每次我尝试都会失败",
        "我永远学不会这个"
    ],
    "emotional_reasoning": [
        "我感觉自己很蠢,所以我肯定很蠢",
        "我觉得危险,所以肯定不安全",
        "我感到内疚,所以我一定做错了什么",
        "我觉得没人爱我,这是事实"
    ],
    "should_statements": [
        "我应该总是做到最好",
        "我不应该犯错",
        "别人应该像我对待他们那样对待我",
        "我应该能控制一切"
    ],
    "labeling": [
        "我是个失败者",
        "他是个十足的坏人",
        "我是个白痴",
        "我是个毫无价值的人"
    ],
    "mind_reading": [
        "我知道他在想什么不好的事",
        "他们肯定觉得我很奇怪",
        "她肯定看不起我",
        "所有人都在嘲笑我"
    ],
    "neutral": [  # healthy thinking (the no-distortion class)
        "我遇到了一些挫折,但我可以从中学习",
        "面试结果不确定,但我做了准备",
        "我犯了个错误,但每个人都会犯错",
        "今天心情不好,但这很正常"
    ]
}
def create_training_data(distortion_data: Dict[str, List[str]] = None) -> pd.DataFrame:
    """Build a flat (text, label) training DataFrame.

    Parameters:
        distortion_data: mapping of label -> list of example sentences.
            Defaults to the module-level COGNITIVE_DISTORTIONS_DATA, so
            existing no-argument call sites behave exactly as before.

    Returns:
        DataFrame with columns "text" and "label", one row per example,
        preserving the mapping's insertion order.
    """
    if distortion_data is None:
        distortion_data = COGNITIVE_DISTORTIONS_DATA
    rows = [
        {"text": example, "label": distortion_type}
        for distortion_type, examples in distortion_data.items()
        for example in examples
    ]
    return pd.DataFrame(rows)
# Build the training dataset and report its size and class balance.
train_df = create_training_data()
print(f"总样本数: {len(train_df)}")
print(f"类别分布:\n{train_df['label'].value_counts()}")
Code collapsed
数据预处理
code
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# Pretrained checkpoint. The tutorial data is Chinese, so Chinese BERT is
# the default; swap in an English checkpoint for English text.
MODEL_NAME = "bert-base-chinese"  # or "bert-base-uncased" for English
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def preprocess_text(text: str, max_length: int = 128, tok=None):
    """Tokenize a single text for the classifier.

    Parameters:
        text: raw input sentence; surrounding whitespace is stripped.
        max_length: sequence length to pad/truncate to.
        tok: optional tokenizer callable. Defaults to the module-level
            `tokenizer`, so existing call sites are unaffected; passing a
            tokenizer explicitly makes the function self-contained and
            testable.

    Returns:
        The encoding produced by the tokenizer (padded to max_length,
        truncated, returned as PyTorch tensors).
    """
    if tok is None:
        tok = tokenizer  # fall back to the tokenizer loaded at module level
    text = text.strip()
    encoding = tok(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    return encoding
def prepare_dataset(df: pd.DataFrame):
    """Split the data into train/validation sets and attach integer label ids.

    Parameters:
        df: DataFrame with "text" and "label" columns.

    Returns:
        (train_df, val_df, label2id, id2label) where label2id maps each
        label string to an integer id (first-occurrence order) and
        id2label is the inverse mapping.
    """
    # Stratified split so every distortion class appears in both sets;
    # fixed random_state keeps the split reproducible.
    train_df, val_df = train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )
    # Work on explicit copies: assigning a new column directly to the
    # split results can raise pandas' SettingWithCopyWarning and leave
    # the assignment silently ineffective on some pandas versions.
    train_df = train_df.copy()
    val_df = val_df.copy()

    # Label <-> id maps, in first-occurrence order of df['label'].
    label2id = {label: idx for idx, label in enumerate(df['label'].unique())}
    id2label = {idx: label for label, idx in label2id.items()}

    # Attach numeric targets for the model.
    train_df['label_id'] = train_df['label'].map(label2id)
    val_df['label_id'] = val_df['label'].map(label2id)
    return train_df, val_df, label2id, id2label


train_df, val_df, label2id, id2label = prepare_dataset(train_df)
# 创建Hugging Face Dataset
class CognitiveDistortionDataset(torch.utils.data.Dataset):
    """Map-style dataset: tokenizes one DataFrame row per __getitem__ call.

    Expects a DataFrame with "text" and "label_id" columns and a
    Hugging Face-style tokenizer callable.
    """

    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        sample = self.df.iloc[idx]
        # Tokenize lazily so memory stays proportional to the batch size.
        encoded = self.tokenizer(
            sample['text'],
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Drop the batch dimension the tokenizer adds (shape (1, L) -> (L,)).
        item = {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
        }
        item['labels'] = torch.tensor(sample['label_id'], dtype=torch.long)
        return item
# Wrap the split DataFrames in PyTorch datasets for the Trainer.
train_dataset = CognitiveDistortionDataset(train_df, tokenizer)
val_dataset = CognitiveDistortionDataset(val_df, tokenizer)
Code collapsed
模型训练
模型架构
code
from transformers import AutoModelForSequenceClassification

# Load the pretrained encoder with a fresh classification head sized to
# the number of distortion classes. Storing the label maps in the model
# config makes saved checkpoints self-describing at inference time.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)
# Inspect the architecture
print(model)
Code collapsed
训练配置
code
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for the Trainer.

    `eval_pred` is the (logits, labels) pair the Trainer passes in;
    predictions are taken as the argmax over the class dimension.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted'
    )
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }
# Training configuration.
training_args = TrainingArguments(
    output_dir='./cognitive-distortion-detector',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # NOTE(review): this keyword was renamed to `eval_strategy` in newer
    # transformers releases — adjust if your version rejects it.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Restore the checkpoint with the best weighted F1 after training.
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True
)
# Wire model, datasets, and metrics into the Hugging Face Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
# Start fine-tuning.
trainer.train()
Code collapsed
模型评估
详细评估
code
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Evaluate on the validation split.
eval_results = trainer.evaluate()
print("验证集结果:", eval_results)

# Generate raw predictions for per-class analysis.
predictions = trainer.predict(val_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Per-class precision/recall/F1 report. label2id was built with
# enumerate, so its keys align with label ids 0..n-1 in order.
print("\n分类报告:")
print(classification_report(
    y_true, y_pred,
    target_names=list(label2id.keys())
))

# Confusion-matrix heatmap, saved to disk for the tutorial write-up.
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=list(label2id.keys()),
    yticklabels=list(label2id.keys())
)
plt.xlabel('预测标签')
plt.ylabel('真实标签')
plt.title('认知扭曲分类混淆矩阵')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
Code collapsed
错误分析
code
def analyze_misclassifications(val_df, y_true, y_pred, id2label):
    """Collect the validation rows the model got wrong.

    Parameters:
        val_df: validation DataFrame with a "text" column, positionally
            aligned with y_true / y_pred.
        y_true, y_pred: integer label ids (true vs. predicted).
        id2label: mapping from label id to label string.

    Returns:
        DataFrame with columns text / true_label / pred_label, one row
        per misclassified example.
    """
    records = [
        {
            'text': val_df.iloc[i]['text'],
            'true_label': id2label[actual],
            'pred_label': id2label[guessed],
        }
        for i, (actual, guessed) in enumerate(zip(y_true, y_pred))
        if actual != guessed
    ]
    return pd.DataFrame(records)
# Inspect a sample of the model's mistakes for qualitative error analysis.
errors = analyze_misclassifications(val_df, y_true, y_pred, id2label)
print("错误分类示例:")
print(errors.head(10))
Code collapsed
实时检测系统
推理管道
code
class CognitiveDistortionDetector:
    """Inference wrapper around a fine-tuned cognitive-distortion classifier."""

    def __init__(self, model_path: str, tokenizer_name: str):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.model.eval()  # inference only: disable dropout etc.
        self.id2label = self.model.config.id2label

    def detect(self, text: str, return_probabilities: bool = False):
        """Detect the dominant cognitive distortion in `text`.

        Parameters:
            text: input sentence.
            return_probabilities: also include per-class probabilities.

        Returns:
            dict with keys:
                "distortion": predicted class label,
                "confidence": probability of the predicted class,
                "is_distorted": True unless the prediction is "neutral",
                "all_probabilities": per-label probabilities (optional).
        """
        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=128
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=-1)

        pred_class = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0, pred_class].item()
        pred_label = self.id2label[pred_class]

        # BUG FIX: the original computed
        #   pred_class != self.id2label.get('neutral', 7)
        # but id2label maps *int id -> label string*, so looking up the
        # string 'neutral' always fell back to 7, making is_distorted
        # depend on an arbitrary index instead of the predicted label.
        # Compare the predicted label string against 'neutral' instead.
        result = {
            "distortion": pred_label,
            "confidence": confidence,
            "is_distorted": pred_label != 'neutral'
        }
        if return_probabilities:
            result["all_probabilities"] = {
                self.id2label[i]: prob.item()
                for i, prob in enumerate(probabilities[0])
            }
        return result

    def detect_batch(self, texts: List[str]):
        """Run detect() (with probabilities) over each text in order."""
        return [self.detect(text, return_probabilities=True) for text in texts]
# Instantiate the detector against the fine-tuned checkpoint.
detector = CognitiveDistortionDetector(
    model_path='./cognitive-distortion-detector',
    tokenizer_name=MODEL_NAME
)
# Single-text detection.
result = detector.detect("我是个彻底的失败者")
print(result)
# Example output: {'distortion': 'all_or_nothing', 'confidence': 0.92, 'is_distorted': True}
Code collapsed
CBT建议生成
code
class CBTInterventionSystem:
    """Maps detected cognitive distortions to CBT intervention material.

    Combines a CognitiveDistortionDetector with a static table of
    challenge questions, reframing suggestions, and restructuring
    exercises per distortion type.
    """

    # String (forward-reference) annotation so this class can be defined
    # and imported without the detector class already in scope.
    def __init__(self, distortion_detector: "CognitiveDistortionDetector"):
        self.detector = distortion_detector
        self.interventions = self._load_interventions()

    def _load_interventions(self):
        """Return the per-distortion intervention table.

        Each distortion maps to a challenge question, a reframing rule,
        and a worked example; "neutral" maps to an encouragement message.
        """
        return {
            "all_or_nothing": {
                "challenge": "这是真的吗?有没有中间地带?",
                "reframe": "用'有时'或'往往'替代'总是'或'从不'",
                "example": "我不是'彻底的失败者',我只是'这次没做好'"
            },
            "catastrophizing": {
                "challenge": "最坏的情况真的那么可能发生吗?",
                "reframe": "考虑最可能、最好、最坏的三种结果",
                "example": "即使面试失败,我还有其他机会"
            },
            "overgeneralization": {
                "challenge": "这个单一事件能代表所有情况吗?",
                "reframe": "寻找例外情况",
                "example": "我这次失败了,但我以前成功过"
            },
            "emotional_reasoning": {
                "challenge": "感觉等同于事实吗?",
                "reframe": "感觉是真实的,但不一定是事实",
                "example": "我感到愚蠢,但这不代表我真的很蠢"
            },
            "should_statements": {
                "challenge": "这个'应该'现实吗?",
                "reframe": "用'希望'或'愿意'替代'应该'",
                "example": "我希望做到最好,而不是我应该做到最好"
            },
            "labeling": {
                "challenge": "一个错误能定义一个人吗?",
                "reframe": "描述行为而非定义人",
                "example": "我犯了个错,而不是我是个失败者"
            },
            "mind_reading": {
                "challenge": "你怎么知道他在想什么?",
                "reframe": "检查你的假设",
                "example": "我不能确定他在想什么"
            },
            "neutral": {
                "message": "你的思维模式很健康!继续保持这种平衡的思考方式。"
            }
        }

    def analyze_and_intervene(self, text: str):
        """Detect distortions in `text` and assemble intervention guidance.

        Returns:
            dict with keys:
                "detection": detector output (with probabilities),
                "intervention": challenge/reframe/example (or the neutral
                    message) for the detected distortion,
                "exercises": list of cognitive-restructuring exercises.
        """
        detection = self.detector.detect(text, return_probabilities=True)

        distortion_type = detection["distortion"]
        # Unknown labels fall back to the neutral message.
        intervention = self.interventions.get(
            distortion_type,
            self.interventions["neutral"]
        )

        exercises = self._generate_exercises(text, distortion_type)
        return {
            "detection": detection,
            "intervention": intervention,
            "exercises": exercises
        }

    def _generate_exercises(self, text: str, distortion_type: str):
        """Return three restructuring exercises for the distortion type.

        Uses a lookup table (consistent with _load_interventions) instead
        of an if/elif chain; types without a dedicated entry get the
        generic Socratic-questioning exercises.
        """
        exercises_by_type = {
            "all_or_nothing": [
                "列出3个'中间地带'的例子",
                "用'有时...有时...'改写原句",
                "识别生活中的灰色地带"
            ],
            "catastrophizing": [
                "评估最坏情况的实际概率(0-100%)",
                "写下如果最坏情况发生,你能如何应对",
                "列出3个更可能的结果"
            ],
            "overgeneralization": [
                "寻找与概括相反的例子",
                "用'有时'替代'总是'",
                "记录具体的而非概括的观察"
            ],
        }
        default_exercises = [
            "挑战这个想法:证据是什么?",
            "考虑替代解释",
            "如果朋友这样想,你会怎么建议?"
        ]
        return exercises_by_type.get(distortion_type, default_exercises)
# Demo: run the full detect-and-intervene pipeline on one thought.
cbt_system = CBTInterventionSystem(detector)
result = cbt_system.analyze_and_intervene("我总是失败,我是个失败者")
print("检测结果:", result["detection"]["distortion"])
print("\n挑战问题:", result["intervention"]["challenge"])
print("\n重构建议:", result["intervention"]["reframe"])
print("\n练习:")
for exercise in result["exercises"]:
    print(f" - {exercise}")
Code collapsed
应用集成
FastAPI服务
code
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI(title="认知扭曲检测API")


class TextRequest(BaseModel):
    # Request body for /api/detect.
    text: str
    return_probabilities: bool = False


class InterventionRequest(BaseModel):
    # Request body for /api/intervene.
    text: str


# Endpoint 1: detect cognitive distortions in a single text.
@app.post("/api/detect")
async def detect_distortion(request: TextRequest):
    """Detect cognitive distortions in the given text."""
    try:
        result = detector.detect(
            request.text,
            return_probabilities=request.return_probabilities
        )
        return {"success": True, "data": result}
    except Exception as e:
        # Surface unexpected failures as HTTP 500 with the error text.
        raise HTTPException(status_code=500, detail=str(e))


# Endpoint 2: detect and return CBT intervention material.
@app.post("/api/intervene")
async def provide_intervention(request: InterventionRequest):
    """Analyze the text and provide a CBT intervention."""
    try:
        result = cbt_system.analyze_and_intervene(request.text)
        return {"success": True, "data": result}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


# Endpoint 3: batch detection over a list of texts.
@app.post("/api/detect-batch")
async def detect_batch(texts: List[str]):
    """Detect cognitive distortions for multiple texts at once."""
    try:
        results = detector.detect_batch(texts)
        return {"success": True, "data": results}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
Code collapsed
关键要点
- Transformer适合细粒度分类:8种认知扭曲类型
- CBT干预提供实用价值:不仅仅是检测
- 多语言支持:中英文BERT模型
- 实时检测支持:推理时间<100ms
- 可解释性很重要:提供挑战和重构建议
常见问题
数据从哪里来?
数据来源:
- CBT治疗记录(匿名化)
- 心理健康论坛帖子
- 学术研究数据集
- 人工标注的示例
如何提高准确率?
- 增加训练数据:每个类别>1000样本
- 数据增强:同义词替换、回译
- 领域适应:在特定领域数据上微调
- 集成方法:多模型投票
可以扩展到其他语言吗?
可以。使用多语言BERT(如XLM-RoBERTa):
code
# Multilingual alternative: swap in XLM-RoBERTa for cross-lingual use.
MODEL_NAME = "xlm-roberta-base"
Code collapsed
参考资料
- CBT认知扭曲理论
- Transformers官方文档
- 心理健康NLP研究论文
- Beck认知疗法理论
发布日期:2026年3月8日 最后更新:2026年3月8日