Deserialization
Deserialization vulnerabilities occur when untrusted data is deserialized using formats that can execute arbitrary code (Pickle, YAML, etc.).
Deserialization vulnerabilities are rated CRITICAL (CVSS 9.8) because they typically allow arbitrary code execution with no authentication required.
Unsafe Deserialization (CRITICAL)
CVSS 9.8 | CWE-502
Untrusted serialized data deserialized without proper validation.
Python Pickle
Vulnerable
Pickle can execute arbitrary code on load
import pickle
def load_agent_state(data: bytes):
    """Deserialize agent state from pickled bytes (intentionally VULNERABLE).

    This is the "what not to do" example: ``pickle.loads`` will invoke
    whatever callables the byte stream names, so attacker-controlled
    ``data`` leads to arbitrary code execution.  Never call this on
    untrusted input.

    Args:
        data: Pickle byte stream to deserialize.

    Returns:
        Whatever object the pickle stream reconstructs.
    """
    # CRITICAL: Pickle can execute arbitrary Python code
    return pickle.loads(data)
def save_agent_state(state):
    """Serialize ``state`` to a pickle byte stream.

    Counterpart of the vulnerable pickle-based loader: anything written
    here must later be read back with ``pickle.loads``, which is unsafe
    once the bytes cross a trust boundary.

    Args:
        state: Any picklable object.

    Returns:
        The pickled representation of ``state`` as ``bytes``.
    """
    return pickle.dumps(state)
# Malicious pickle payload:
# pickle.loads(b"cos\nsystem\n(S'rm -rf /'\ntR.")
Secure
JSON is safe for data interchange
import json
from dataclasses import dataclass, asdict
from typing import List, Dict, Any
@dataclass
class AgentState:
    """Plain-data record holding an agent's state.

    Only built-in container and scalar types are used for the fields, so
    an instance can be converted with ``dataclasses.asdict`` and rebuilt
    from keyword arguments.
    """

    # Message records, each a mapping of string keys to string values.
    messages: List[Dict[str, str]]
    # Free-form context mapping keyed by string.
    context: Dict[str, Any]
    # Step counter; defaults to 0 when not supplied.
    step: int = 0
def load_agent_state(data: str) -> AgentState:
    """Rebuild an ``AgentState`` from a JSON document.

    ``json.loads`` only produces plain data (dicts, lists, strings,
    numbers, booleans, ``None``), so parsing cannot run code embedded in
    the input.

    Args:
        data: JSON text whose top-level object carries the ``AgentState``
            fields.

    Returns:
        The reconstructed ``AgentState``.

    Raises:
        json.JSONDecodeError: If ``data`` is not valid JSON.
        TypeError: If the parsed value cannot be expanded into the
            ``AgentState`` constructor (not a mapping, or unknown/missing
            field names).
    """
    parsed = json.loads(data)
    # Keyword expansion makes the dataclass constructor reject unexpected keys.
    return AgentState(**parsed)
def save_agent_state(state: AgentState) -> str:
return json.dumps(asdict(state))
LangChain Pickle Vectors
Vulnerable
Loading serialized chains from untrusted source
from langchain.chains import load_chain
def load_user_chain(chain_path):
    """Load a serialized chain from ``chain_path`` (intentionally VULNERABLE).

    The path is handed straight to ``load_chain`` with no check on where
    the file came from, so a user-supplied file is deserialized as-is.

    Args:
        chain_path: Location of the serialized chain to load.

    Returns:
        The object returned by ``load_chain``.
    """
    # DANGEROUS: User-provided pickle file
    return load_chain(chain_path)
# Attacker uploads malicious .pkl file
Secure
Use safe serialization or rebuild from config
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import json
def load_user_chain(config_path):
# Load only configuration, not executable code
with open(config_path) as f:
config = json.load(f)
# Rebuild chain from safe config
prompt = PromptTemplate(
template=config["template"],
input_variables=config["input_variables"]
)
return LLMChain(llm=llm, prompt=prompt)
Unsafe YAML Loading (CRITICAL)
CVSS 9.8 | CWE-502
YAML loaded with unsafe options allowing arbitrary code execution.
Vulnerable
yaml.load() can execute Python code
import yaml
def load_config(path):
    """Parse the YAML file at ``path`` with ``yaml.load`` (intentionally VULNERABLE).

    This is the "what not to do" example: the unsafe loader can construct
    arbitrary Python objects from tags in the document, so a hostile file
    can run code.  Use ``yaml.safe_load`` for anything untrusted.

    NOTE(review): recent PyYAML releases (6.0+) require an explicit
    ``Loader`` argument, so this bare call would raise ``TypeError`` there
    -- confirm against the PyYAML version this example targets.

    Args:
        path: Filesystem path of the YAML document.

    Returns:
        The object produced by the YAML loader.
    """
    with open(path) as f:
        # CRITICAL: Can execute arbitrary Python
        return yaml.load(f)
# Malicious YAML:
# !!python/object/apply:os.system ['whoami']
Secure
yaml.safe_load() only allows basic types
import yaml
def load_config(path):
with open(path) as f:
# safe_load only allows: str, int, float, bool, None, list, dict
return yaml.safe_load(f)
Always use yaml.safe_load() - The default yaml.load() is dangerous and should never be used with untrusted input.
Vector Store Deserialization
Some vector stores use pickle internally for persistence.
Vulnerable
Loading pickled vector store from user
from langchain.vectorstores import FAISS
def load_user_knowledge_base(path):
    """Load a persisted FAISS store from ``path`` (intentionally VULNERABLE).

    The path is passed to ``FAISS.load_local`` without any validation, so
    a user-supplied store is deserialized as-is.

    Args:
        path: Location of the persisted vector store.

    Returns:
        The vector store returned by ``FAISS.load_local``.
    """
    # DANGEROUS: FAISS uses pickle internally
    # ``embeddings`` is a free name here; it must be defined by the
    # surrounding module.
    return FAISS.load_local(path, embeddings)
# User uploads malicious .faiss file with embedded pickle
Secure
Rebuild from safe document sources
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
import os
# Extensions accepted for ingestion.  Only plain-text formats are listed:
# every accepted file is read with TextLoader, which treats the file as
# text.  PDFs are binary and need a dedicated PDF loader, so they are not
# accepted here.
ALLOWED_EXTENSIONS = {".txt", ".md"}
def build_knowledge_base(doc_paths):
documents = []
for path in doc_paths:
ext = os.path.splitext(path)[1].lower()
if ext not in ALLOWED_EXTENSIONS:
raise ValueError(f"Unsupported file type: {ext}")
loader = TextLoader(path)
documents.extend(loader.load())
# Build fresh from documents (no pickle)
return FAISS.from_documents(documents, embeddings)
Safe Serialization Alternatives
| Format | Safety | Use Case |
|---|---|---|
| JSON | Safe | Configuration, state, API responses |
| MessagePack | Safe | Binary data, performance-critical |
| Protocol Buffers | Safe | Typed data interchange |
| YAML (safe_load) | Safe | Configuration files |
| Pickle | UNSAFE | Never use with untrusted data |
| YAML (load) | UNSAFE | Never use with untrusted data |
Defense Strategies
1. Restricted Unpickler (If Pickle Required)
import pickle
import io
# (module, name) pairs the restricted unpickler is allowed to resolve.
# Anything not listed here is rejected before it can be imported or called.
ALLOWED_CLASSES = {
    ('builtins', 'dict'),
    ('builtins', 'list'),
    ('builtins', 'str'),
    ('builtins', 'int'),
    ('builtins', 'float'),
}


class RestrictedUnpickler(pickle.Unpickler):
    """Unpickler that resolves only the globals listed in ``ALLOWED_CLASSES``."""

    def find_class(self, module, name):
        """Resolve a global referenced by the pickle stream, if allow-listed.

        Args:
            module: Module name recorded in the pickle stream.
            name: Attribute name within that module.

        Returns:
            The resolved object, as returned by ``pickle.Unpickler.find_class``.

        Raises:
            pickle.UnpicklingError: If ``(module, name)`` is not in
                ``ALLOWED_CLASSES``.
        """
        # Reject first so a forbidden global is never imported or looked up.
        if (module, name) not in ALLOWED_CLASSES:
            raise pickle.UnpicklingError(f"Forbidden class: {module}.{name}")
        return super().find_class(module, name)
def safe_pickle_loads(data):
return RestrictedUnpickler(io.BytesIO(data)).load()
2. Content-Type Validation
import magic
def validate_file_type(file_path, expected_type):
detected = magic.from_file(file_path, mime=True)
if detected != expected_type:
raise ValueError(f"Expected {expected_type}, got {detected}")
3. Signature Verification
import hmac
import hashlib
def sign_data(data: bytes, secret: bytes) -> bytes:
    """Prefix ``data`` with its HMAC-SHA256 tag.

    Args:
        data: Payload to authenticate.
        secret: Shared key used for the HMAC.

    Returns:
        The 32-byte HMAC-SHA256 digest of ``data`` followed by ``data``
        itself.
    """
    signature = hmac.new(secret, data, hashlib.sha256).digest()
    # Fixed-length tag goes first so a reader can split at byte 32.
    return signature + data
def verify_and_load(signed_data: bytes, secret: bytes) -> dict:
signature = signed_data[:32]
data = signed_data[32:]
expected = hmac.new(secret, data, hashlib.sha256).digest()
if not hmac.compare_digest(signature, expected):
raise ValueError("Invalid signature - data may be tampered")
return json.loads(data) # Use safe JSON, not pickle
Checklist
- Replace all pickle.load() with JSON or MessagePack
- Replace all yaml.load() with yaml.safe_load()
- Validate file types before processing
- Sign serialized data if integrity is critical
- Audit LangChain/LlamaIndex usage for pickle vectors
- Never load serialized models from untrusted sources
Last updated on