Optimize Performance
Problem
Python applications can suffer from performance bottlenecks caused by the interpreted nature of the language, inefficient algorithms, excessive memory allocation, or poor data structure choices. Effective optimization requires a systematic approach: profile first, then fix the bottlenecks you actually measured, rather than optimizing prematurely.
Solution
1. CPU Profiling with cProfile
Identify CPU-intensive code sections using Python’s built-in profiler.
import cProfile
import pstats
from pstats import SortKey
def slow_function():
    """Example function with performance issues"""
    total = 0
    for i in range(1000000):
        total += i
    return total

def another_function():
    """Another function to profile"""
    result = []
    for i in range(100000):
        result.append(i ** 2)
    return result

def main():
    """Main function calling multiple functions"""
    slow_function()
    another_function()

if __name__ == '__main__':
    # Explicit enable/disable around the code under test
    profiler = cProfile.Profile()
    profiler.enable()
    main()
    profiler.disable()

    stats = pstats.Stats(profiler)
    stats.sort_stats(SortKey.CUMULATIVE)
    stats.print_stats(10)  # Show top 10 functions

    # One-liner: run a statement and save the stats to a file
    cProfile.run('slow_function()', 'profile_stats')

    # Context-manager form (Python 3.8+)
    with cProfile.Profile() as pr:
        main()
    stats = pstats.Stats(pr)
    stats.sort_stats(SortKey.TIME)
    stats.print_stats()

    # Analyze stats saved to a file
    stats = pstats.Stats('profile_stats')
    stats.strip_dirs()
    stats.sort_stats('cumulative')
    stats.print_stats(10)
    stats.print_stats('slow_function')  # Filter output by function name

2. Memory Profiling with memory_profiler
Track memory usage line-by-line to identify memory leaks and inefficient allocations.
from memory_profiler import profile
# Run with: python -m memory_profiler script.py
@profile
def memory_intensive():
    """Function with high memory usage"""
    # List comprehension - creates entire list in memory
    data = [i for i in range(1000000)]
    # Dictionary creation
    mapping = {i: i**2 for i in range(100000)}
    # String concatenation (inefficient)
    result = ""
    for i in range(10000):
        result += str(i)
    return sum(data)

# Alternative: tracemalloc from the standard library
import tracemalloc

tracemalloc.start()
data = [i for i in range(1000000)]
current, peak = tracemalloc.get_traced_memory()
print(f"Current memory usage: {current / 10**6:.2f} MB")
print(f"Peak memory usage: {peak / 10**6:.2f} MB")
tracemalloc.stop()

3. Optimize with NumPy for Numerical Operations
Replace pure Python loops with vectorized NumPy operations for massive speed gains.
import numpy as np
import time
def python_sum(n):
    total = 0
    for i in range(n):
        total += i
    return total

def numpy_sum(n):
    return np.arange(n).sum()

n = 1_000_000

start = time.perf_counter()
result1 = python_sum(n)
python_time = time.perf_counter() - start

start = time.perf_counter()
result2 = numpy_sum(n)
numpy_time = time.perf_counter() - start

print(f"Python: {python_time:.4f}s")
print(f"NumPy: {numpy_time:.4f}s")
print(f"Speedup: {python_time / numpy_time:.1f}x")

# Vectorized: one expression, evaluated in C
data = np.random.rand(1000, 1000)
result = (data * 2 + 3) / 5  # All at once

# Equivalent element-wise loop (orders of magnitude slower)
result_slow = np.zeros_like(data)
for i in range(data.shape[0]):
    for j in range(data.shape[1]):
        result_slow[i, j] = (data[i, j] * 2 + 3) / 5

4. Use Appropriate Data Structures
Choose the right data structure for your use case.
import time
data_list = list(range(10000))
data_set = set(range(10000))
# Membership test: list scans O(n), set hashes O(1)
start = time.perf_counter()
for _ in range(1000):
    9999 in data_list
list_time = time.perf_counter() - start

start = time.perf_counter()
for _ in range(1000):
    9999 in data_set
set_time = time.perf_counter() - start

print(f"List: {list_time:.4f}s, Set: {set_time:.4f}s")
print(f"Set is {list_time / set_time:.1f}x faster")

# Bounded queue: popping from the front of a list shifts every element
from collections import deque

queue_list = []
for i in range(10000):
    queue_list.append(i)
    if len(queue_list) > 100:
        queue_list.pop(0)  # O(n) operation

queue_deque = deque(maxlen=100)
for i in range(10000):
    queue_deque.append(i)  # O(1); oldest item is discarded automatically

5. Generator Expressions for Memory Efficiency
Use generators instead of list comprehensions for large datasets.
# List comprehension: materializes all 1,000,000 values at once
large_list = [x**2 for x in range(1_000_000)]
total = sum(large_list)

# Generator expression: yields values lazily, one at a time
large_gen = (x**2 for x in range(1_000_000))
total = sum(large_gen)

def read_large_file_list(filepath):
    """Loads entire file into memory"""
    with open(filepath) as f:
        return [line.strip() for line in f]

def read_large_file_gen(filepath):
    """Yields lines one at a time"""
    with open(filepath) as f:
        for line in f:
            yield line.strip()

# process() stands in for your own line handler
for line in read_large_file_gen('large_file.txt'):
    process(line)

6. Caching and Memoization
Cache expensive function calls to avoid redundant computation.
from functools import lru_cache
import time
def fibonacci_slow(n):
    if n < 2:
        return n
    return fibonacci_slow(n-1) + fibonacci_slow(n-2)

@lru_cache(maxsize=None)
def fibonacci_fast(n):
    if n < 2:
        return n
    return fibonacci_fast(n-1) + fibonacci_fast(n-2)

start = time.perf_counter()
result1 = fibonacci_slow(30)
slow_time = time.perf_counter() - start

start = time.perf_counter()
result2 = fibonacci_fast(30)
fast_time = time.perf_counter() - start

print(f"Without cache: {slow_time:.4f}s")
print(f"With cache: {fast_time:.4f}s")
print(f"Speedup: {slow_time / fast_time:.1f}x")

# Manual memoization with a module-level dict
_cache = {}

def expensive_computation(x, y):
    key = (x, y)
    if key not in _cache:
        time.sleep(0.1)  # Simulate expensive work
        _cache[key] = x ** y
    return _cache[key]

7. String Optimization
Build large strings with str.join rather than repeated concatenation.
import time
def concatenate_slow(n):
    # Each += copies the whole string: O(n^2) overall
    result = ""
    for i in range(n):
        result += str(i)
    return result

def concatenate_fast(n):
    # Collect the parts, join once: O(n) overall
    parts = []
    for i in range(n):
        parts.append(str(i))
    return ''.join(parts)

def concatenate_fastest(n):
    return ''.join(str(i) for i in range(n))

n = 10000

start = time.perf_counter()
concatenate_slow(n)
slow_time = time.perf_counter() - start

start = time.perf_counter()
concatenate_fast(n)
fast_time = time.perf_counter() - start

start = time.perf_counter()
concatenate_fastest(n)
fastest_time = time.perf_counter() - start

print(f"Slow: {slow_time:.4f}s")
print(f"Fast: {fast_time:.4f}s")
print(f"Fastest: {fastest_time:.4f}s")

8. Use Built-in Functions and Libraries
Leverage C-optimized built-in functions instead of reimplementing them in Python.
# Hand-rolled in Python (slow)
def manual_sum(data):
    total = 0
    for item in data:
        total += item
    return total

data = list(range(1_000_000))
result = sum(data)  # C-implemented built-in

def manual_max(data):
    maximum = data[0]
    for item in data:
        if item > maximum:
            maximum = item
    return maximum

result = max(data)  # Built-in

# O(n^2) hand-rolled sort
def bubble_sort(arr):
    n = len(arr)
    for i in range(n):
        for j in range(0, n-i-1):
            if arr[j] > arr[j+1]:
                arr[j], arr[j+1] = arr[j+1], arr[j]
    return arr

sorted_data = sorted(data)  # Timsort: O(n log n), implemented in C

How It Works
Profiling Workflow
- Baseline Measurement: Profile the unmodified code to establish a baseline (see the sketch after this list)
- Identify Hotspots: Focus on the functions consuming the most time or memory
- Optimize: Apply targeted optimizations to hotspots only
- Re-measure: Profile again to verify the improvement
- Iterate: Repeat until performance targets are met
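As a minimal sketch of this loop (slow_version and fast_version are hypothetical stand-ins for your own code):

import timeit

def slow_version(data):
    # Baseline found to be a hotspot
    return sum([x * x for x in data])  # builds an intermediate list

def fast_version(data):
    # Candidate optimization of that hotspot
    return sum(x * x for x in data)  # streams values instead

data = list(range(100_000))

# Measure the baseline, then re-measure the candidate under identical conditions
baseline = timeit.timeit(lambda: slow_version(data), number=50)
candidate = timeit.timeit(lambda: fast_version(data), number=50)
print(f"Baseline: {baseline:.3f}s, Candidate: {candidate:.3f}s")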
Memory Optimization Principles
- Generators: Lazy evaluation reduces peak memory usage
- In-place Operations: Modify data structures instead of creating copies (see the sketch after this list)
- Reference Counting: CPython frees an object as soon as its reference count drops to zero; the cyclic garbage collector handles the rest
- Data Structure Choice: Use appropriate containers (set for membership, deque for queues)
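A minimal sketch of the in-place principle, using NumPy arrays and plain lists:

import numpy as np

data = np.random.rand(1_000_000)

scaled_copy = data * 2  # allocates a second ~8 MB array
data *= 2               # reuses the existing buffer in place

values = [3, 1, 2]
values.sort()             # sorts in place, no new list
ordered = sorted(values)  # allocates a new list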
CPU Optimization Techniques
- Vectorization: NumPy/Pandas operations run at C speed (see the pandas sketch after this list)
- Caching: Memoization trades memory for CPU cycles
- Built-ins: C-implemented functions are faster than pure Python
- Algorithm Choice: O(n log n) sorting beats O(n²) bubble sort
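Pandas follows the same vectorization principle as the NumPy examples above; a minimal sketch:

import pandas as pd

df = pd.DataFrame({"price": [10.0, 20.0, 30.0], "qty": [1, 2, 3]})

# Vectorized column arithmetic runs in C
df["total"] = df["price"] * df["qty"]

# Avoid the row-by-row equivalent: df.apply(..., axis=1) is much slower
print(df)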
Variations
Line Profiler for Line-by-Line Analysis
Use line_profiler for detailed line-by-line CPU profiling:
from line_profiler import LineProfiler

def process_data(data):
    result = []
    for item in data:
        result.append(item ** 2)
    return sum(result)

# Programmatic API
profiler = LineProfiler()
profiler.add_function(process_data)
data = list(range(10000))
profiler.runcall(process_data, data)
profiler.print_stats()

# Decorator style: run with `kernprof -l -v script.py`,
# which injects the `profile` decorator
@profile  # noqa
def process_data_decorated(data):
    result = []
    for item in data:
        result.append(item ** 2)
    return sum(result)

PyPy for JIT Compilation
Use PyPy as a drop-in replacement for CPython; its JIT compiler speeds up pure-Python hot loops automatically.
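To see the effect, time the same loop under both interpreters; bench.py here is a hypothetical file name:

# bench.py - run as `python3 bench.py`, then as `pypy3 bench.py`
import time

def hot_loop(n):
    total = 0
    for i in range(n):
        total += i * i
    return total

start = time.perf_counter()
hot_loop(10_000_000)
print(f"{time.perf_counter() - start:.2f}s")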
pypy3 script.py

Cython for C-Speed Extensions
Compile Python code to C for near-native performance:
# example.pyx - static C types via cdef
def compute_sum(int n):
    cdef int i, total = 0
    for i in range(n):
        total += i
    return total

# setup.py - build with: python setup.py build_ext --inplace
from setuptools import setup
from Cython.Build import cythonize

setup(
    ext_modules=cythonize("example.pyx")
)

Multiprocessing for CPU-Bound Tasks
Bypass GIL with multiprocessing for parallel CPU work:
from multiprocessing import Pool
import time

def cpu_intensive(n):
    """Simulates CPU-bound work"""
    total = 0
    for i in range(n):
        total += i ** 2
    return total

if __name__ == '__main__':  # Guard required where workers are spawned (Windows/macOS)
    # Sequential baseline
    start = time.perf_counter()
    results = [cpu_intensive(1_000_000) for _ in range(4)]
    sequential_time = time.perf_counter() - start

    # Four worker processes, each with its own interpreter and GIL
    start = time.perf_counter()
    with Pool(processes=4) as pool:
        results = pool.map(cpu_intensive, [1_000_000] * 4)
    parallel_time = time.perf_counter() - start

    print(f"Sequential: {sequential_time:.2f}s")
    print(f"Parallel: {parallel_time:.2f}s")
    print(f"Speedup: {sequential_time / parallel_time:.1f}x")

Threading for I/O-Bound Tasks
Use threading for I/O-bound operations (file I/O, network requests):
import concurrent.futures
import requests
import time
urls = [
    'https://httpbin.org/delay/1',
    'https://httpbin.org/delay/1',
    'https://httpbin.org/delay/1',
    'https://httpbin.org/delay/1',
]

def fetch_url(url):
    response = requests.get(url)
    return response.status_code

# Sequential: waits for each response in turn
start = time.perf_counter()
results = [fetch_url(url) for url in urls]
sequential_time = time.perf_counter() - start

# Threaded: requests overlap while waiting on the network
start = time.perf_counter()
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(fetch_url, urls))
parallel_time = time.perf_counter() - start

print(f"Sequential: {sequential_time:.2f}s")
print(f"Parallel: {parallel_time:.2f}s")
print(f"Speedup: {sequential_time / parallel_time:.1f}x")

Common Pitfalls
Premature Optimization
Problem: Optimizing code before identifying actual bottlenecks wastes time.
Solution: Always profile first. Optimize only measured hotspots.
# Complex, hard-to-read code optimized before any profiling
def optimized_function(data):
    return sum(x**2 for x in data if x % 2 == 0)

# Simple, readable code - optimize it only if a profile says you must
def readable_function(data):
    evens = [x for x in data if x % 2 == 0]
    squares = [x**2 for x in evens]
    return sum(squares)

Ignoring Algorithm Complexity
Problem: Micro-optimizations don’t fix O(n²) algorithms.
Solution: Choose appropriate algorithms before micro-optimizing.
# O(n^2) - no micro-optimization will save this
def bubble_sort(arr):
    n = len(arr)
    for i in range(n):
        for j in range(0, n-i-1):
            if arr[j] > arr[j+1]:
                arr[j], arr[j+1] = arr[j+1], arr[j]
    return arr

# O(n log n) - fix the algorithm instead
arr = [5, 2, 9, 1]  # example input
sorted_arr = sorted(arr)

Over-Caching
Problem: Excessive caching consumes memory and may slow down simple operations.
Solution: Cache selectively. Use bounded caches (maxsize parameter).
from functools import lru_cache
# Unbounded cache on a trivial function: pure overhead
@lru_cache(maxsize=None)
def simple_multiply(a, b):
    return a * b  # Too simple to benefit from caching

# Bounded cache on genuinely expensive work
@lru_cache(maxsize=128)
def expensive_computation(n):
    # Complex calculation that benefits from caching
    return sum(i**2 for i in range(n))

Misusing Generators
Problem: Generators can’t be reused and may be slower for small datasets.
Solution: Use generators for large datasets or single-pass iteration only.
data_gen = (x**2 for x in range(10))
sum1 = sum(data_gen)
sum2 = sum(data_gen)  # Returns 0! Generator exhausted

# Small dataset reused twice: a list is the right choice
data_list = [x**2 for x in range(10)]
sum1 = sum(data_list)
sum2 = sum(data_list)  # Works correctly

# Large dataset consumed once: a generator is the right choice
large_gen = (x**2 for x in range(1_000_000))
result = sum(large_gen)

Profiling in Debug Mode
Problem: Profiling code that still carries debug-only work (assertions, extra logging) misrepresents production performance.
Solution: Profile the code as it will actually run in production, without debug flags. Assertions are the classic example: they execute in normal runs but are stripped under python -O, as the sketch below shows.
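A minimal illustration (checked_sum is a hypothetical example function):

def checked_sum(data):
    total = 0
    for x in data:
        assert x >= 0, "negative input"  # runs normally, removed under -O
        total += x
    return total

print(checked_sum([1, 2, 3]))  # profile this both with and without -O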
python -m cProfile script.py
python -O -m cProfile script.py  # -O strips assert statements

Not Considering Trade-offs
Problem: Optimizing for speed may increase memory usage or complexity.
Solution: Balance speed, memory, and maintainability based on actual requirements.
# Memory-efficient but single-pass
def process_large_file_gen(filepath):
    with open(filepath) as f:
        for line in f:
            yield expensive_transform(line)

# Re-iterable but holds every result in memory
def process_large_file_list(filepath):
    with open(filepath) as f:
        data = [expensive_transform(line) for line in f]
    return data

Ignoring GIL for CPU-Bound Tasks
Problem: Threading doesn’t speed up CPU-bound Python code due to GIL.
Solution: Use multiprocessing or async I/O based on workload type.
import threading

def cpu_work(n):
    return sum(i**2 for i in range(n))

# Threads: no speedup - the GIL serializes the bytecode
threads = [threading.Thread(target=cpu_work, args=(1_000_000,)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()

# Processes: true parallelism across cores
from multiprocessing import Pool

if __name__ == '__main__':
    with Pool(4) as pool:
        results = pool.map(cpu_work, [1_000_000] * 4)

Related Patterns
Related Tutorial: See Intermediate Tutorial - Performance. Related Cookbook: See Cookbook recipe “Performance Optimization”.