# Python and NumPy Basics for Machine Learning

This notebook covers the essential Python and NumPy concepts needed for machine learning, extracted from our course slides with additional practical examples.

## 1. Python Basics Review

Let's start with fundamental Python concepts that we'll use throughout the course.

In [None]:
# Variables and data types
# Python infers the numeric type from the literal on the right-hand side.
x, y, z = 42, 3.14, 2 + 3j  # int, float, complex

# Show every value next to the type Python assigned to it.
print(
    f"Integer: {x}, type: {type(x)}",
    f"Float: {y}, type: {type(y)}",
    f"Complex: {z}, type: {type(z)}",
    sep="\n",
)

In [None]:
# Collections - the built-in containers used throughout the course
my_list = [1, 2, 3, 4]     # list: mutable, items can be reassigned in place
my_tuple = tuple(my_list)  # tuple: immutable, fixed once created
my_dict = dict(a=1, b=2)   # dict: maps keys to values

print(
    f"List: {my_list}",
    f"Tuple: {my_tuple}",
    f"Dictionary: {my_dict}",
    sep="\n",
)

# Mutability in action: assigning to a list slot is allowed.
my_list[0] = 10
print(f"Modified list: {my_list}")

# The same assignment on the tuple raises an error, so it stays commented out:
# my_tuple[0] = 10

In [None]:
# Control flow - loops and conditionals
# Walk through 0..4 and report the parity of each number.  The header now
# describes what the loop actually prints (it classifies every number in
# range(5), it does not list the even numbers up to 8).
print("Even and odd numbers from 0 to 4:")
for i in range(5):
    if i % 2 == 0:
        print(f"{i} is even")
    else:
        print(f"{i} is odd")

In [None]:
# List comprehensions - build a new list from an iterable in one expression
numbers = list(range(1, 6))
squared = [n * n for n in numbers]

print(f"Original: {numbers}")
print(f"Squared: {squared}")

# A trailing `if` clause filters first: keep the even numbers, then square them
even_squared = [n * n for n in numbers if not n % 2]
print(f"Even numbers squared: {even_squared}")

## 2. Introduction to NumPy

NumPy is the foundation of scientific computing in Python. It provides efficient operations on arrays of numbers.

In [None]:
# Import NumPy (standard convention)
import numpy as np

# Report which NumPy version is installed
print(f"NumPy version: {np.__version__}")

# The same four values held in a built-in list and in a NumPy array
# (this cell only shows the two containers; nothing is timed here)
python_list = [1, 2, 3, 4]
numpy_array = np.array(python_list)

print(
    f"Python list: {python_list}",
    f"NumPy array: {numpy_array}",
    f"Array type: {type(numpy_array)}",
    sep="\n",
)

In [None]:
# Vectorized operations - one expression acts on every element of an array
python_list = [1, 2, 3, 4]
numpy_array = np.array(python_list)

# Plain Python: visit the items one at a time and collect the doubled values
python_result = []
for x in python_list:
    python_result += [2 * x]
print(f"Python way: {python_result}")

# NumPy: multiplying the array doubles all elements, no explicit loop needed
numpy_result = 2 * numpy_array
print(f"NumPy way: {numpy_result}")

# For large arrays the vectorized form is much faster than the Python loop.

## 3. Creating NumPy Arrays

There are many ways to create NumPy arrays depending on your needs.

In [None]:
# Common array constructors
a = np.array([1, 2, 3, 4])           # converted from a Python list
b = np.zeros(5)                      # five zeros
c = np.ones((2, 3))                  # ones, 2 rows x 3 columns
d = np.arange(0, 10, 2)              # start, stop (exclusive), step -> [0, 2, 4, 6, 8]
e = np.linspace(0, 1, num=5)         # 5 evenly spaced points, both ends included
f = np.random.random(size=(3, 3))    # 3x3 matrix of random values

print(f"From list: {a}")
print(f"Zeros: {b}")
print("Ones (2x3):", c, sep="\n")
print(f"Range with step: {d}")
print(f"Linspace: {e}")
print("Random 3x3:", f, sep="\n")

In [None]:
# Array properties - inspect these first to understand what your data looks like
array_2d = np.array(
    [
        [1, 2, 3],
        [4, 5, 6],
    ]
)

print("Array:", array_2d, sep="\n")
print(
    f"Shape: {array_2d.shape}",                   # (rows, columns)
    f"Data type: {array_2d.dtype}",               # element type
    f"Number of dimensions: {array_2d.ndim}",     # 1D, 2D, 3D, ...
    f"Total elements: {array_2d.size}",           # element count
    f"Memory usage: {array_2d.nbytes} bytes",     # bytes used by the elements
    sep="\n",
)

## 4. Basic Array Operations

NumPy allows element-wise operations and mathematical functions on entire arrays.

In [None]:
# Arithmetic operators work element by element on equally shaped arrays
a = np.arange(1, 5)   # [1, 2, 3, 4]
b = np.arange(5, 9)   # [5, 6, 7, 8]

print(f"a = {a}", f"b = {b}", sep="\n")
print(
    f"a + b = {a + b}",      # addition
    f"a - b = {a - b}",      # subtraction
    f"a * b = {a * b}",      # element-wise product, NOT a matrix product
    f"a / b = {a / b}",      # division
    f"a ** 2 = {a ** 2}",    # power
    sep="\n",
)

In [None]:
# NumPy math functions are applied to every element of the array
a = np.arange(1, 5) ** 2   # the perfect squares [1, 4, 9, 16]

print(f"Original array: {a}")
print(
    f"Square root: {np.sqrt(a)}",
    f"Exponential: {np.exp(a)}",
    f"Natural log: {np.log(a)}",
    f"Sine: {np.sin(a)}",
    f"Cosine: {np.cos(a)}",
    sep="\n",
)

In [None]:
# Summary statistics over a whole array
data = np.arange(1, 11)   # the integers 1..10

print(f"Data: {data}")
print(
    f"Sum: {data.sum()}",
    f"Mean: {data.mean()}",
    f"Standard deviation: {data.std()}",
    f"Minimum: {data.min()}",
    f"Maximum: {data.max()}",
    f"Median: {np.median(data)}",
    sep="\n",
)

## 5. Array Indexing and Slicing

Accessing and modifying array elements is crucial for data manipulation.

In [None]:
# Indexing and slicing a 1D array
arr = np.arange(10, 60, 10)   # [10, 20, 30, 40, 50]

print(f"Array: {arr}")
print(
    f"First element (index 0): {arr[0]}",
    f"Last element (index -1): {arr[-1]}",          # negative indices count from the end
    f"Second to fourth (index 1:4): {arr[1:4]}",    # the stop index is excluded
    f"Every other element: {arr[::2]}",             # step of 2
    f"Reverse array: {arr[::-1]}",                  # negative step walks backwards
    sep="\n",
)

In [None]:
# Indexing a 2D array with [row, column]
matrix = np.arange(1, 10).reshape(3, 3)   # rows [1 2 3], [4 5 6], [7 8 9]

print("Matrix:", matrix, sep="\n")
print(f"Element at row 0, column 1: {matrix[0, 1]}")
print(f"First row: {matrix[0]}")           # a single index selects a whole row
print(f"First column: {matrix[:, 0]}")     # ':' keeps every row
print("2x2 submatrix (top-left):")
print(matrix[:2, :2])

In [None]:
# Boolean indexing - select elements with a True/False mask
data = np.array([1, 5, 3, 8, 2, 9, 4])
print(f"Original data: {data}")

# A comparison yields one boolean per element
mask = data > 5
print(f"Mask (elements > 5): {mask}")

# Indexing with the mask keeps only the positions that are True
large_values = data[mask]
print(f"Values > 5: {large_values}")

# Mask creation and selection can be fused into a single expression
small_values = data[~(data > 3)]
print(f"Values <= 3: {small_values}")

## 6. Array Reshaping and Broadcasting

Understanding shapes and how arrays interact is crucial for machine learning.

In [None]:
# Reshaping - same elements, different arrangement
original = np.arange(12)   # [0, 1, 2, ..., 11]
print(f"Original array: {original}")
print(f"Shape: {original.shape}")

# 12 elements laid out as 3 rows of 4
matrix_3x4 = original.reshape((3, 4))
print("\nReshaped to 3x4:", matrix_3x4, sep="\n")

# ... or as 2 rows of 6
matrix_2x6 = original.reshape((2, 6))
print("\nReshaped to 2x6:", matrix_2x6, sep="\n")

# flatten() collapses the matrix back into one dimension
flattened = matrix_3x4.flatten()
print(f"\nFlattened: {flattened}")

In [None]:
# Broadcasting - combining arrays whose shapes differ

# Case 1: a scalar is applied to every element
arr = np.arange(1, 5)
result = 10 + arr
print(f"Array: {arr}")
print(f"Array + 10: {result}")

# Case 2: a length-3 vector combined with a 2x3 matrix
matrix = np.arange(1, 7).reshape(2, 3)
vector = np.array([10, 20, 30])

print(f"\nMatrix shape: {matrix.shape}")
print(matrix)
print(f"\nVector shape: {vector.shape}")
print(vector)

# The vector is reused for each row of the matrix
broadcast_result = vector + matrix
print("\nResult of matrix + vector:")
print(broadcast_result)

## 7. Working with Multi-dimensional Arrays

Real-world data often comes in higher dimensions (images, time series, etc.).

In [None]:
# 3D arrays - picture a stack of 2D matrices
# Random integers from 0 to 9 arranged as 2 matrices of shape 3x4
array_3d = np.random.randint(low=0, high=10, size=(2, 3, 4))

print(f"3D array shape: {array_3d.shape}")
print("3D array:", array_3d, sep="\n")

# A single index on the first axis picks one matrix out of the stack
print("\nFirst matrix (index 0):", array_3d[0], sep="\n")

# Three indices address one element: [matrix, row, column]
print(f"\nElement at position [1, 2, 3]: {array_3d[1, 2, 3]}")

In [None]:
# Reductions along a chosen axis
matrix = np.arange(1, 10).reshape(3, 3)

print("Matrix:", matrix, sep="\n")

# axis=0 collapses the rows (one result per column);
# axis=1 collapses the columns (one result per row)
print(f"\nSum of all elements: {matrix.sum()}")
print(f"Sum along axis 0 (columns): {matrix.sum(axis=0)}")
print(f"Sum along axis 1 (rows): {matrix.sum(axis=1)}")

# The mean follows the same axis convention
print(f"\nMean along axis 0 (columns): {matrix.mean(axis=0)}")
print(f"Mean along axis 1 (rows): {matrix.mean(axis=1)}")

In [None]:
# Joining arrays
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])

# Two 1D arrays are joined end to end
concat_horizontal = np.concatenate((arr1, arr2))
print(f"Horizontal concatenation: {concat_horizontal}")

# With 2D arrays the axis decides the direction of the join
mat1 = np.array([[1, 2], [3, 4]])
mat2 = np.array([[5, 6], [7, 8]])

# axis=0: mat2's rows are placed below mat1's rows
vertical_stack = np.concatenate((mat1, mat2), axis=0)
print("\nVertical stack:", vertical_stack, sep="\n")

# axis=1: mat2's columns are placed to the right of mat1's columns
horizontal_stack = np.concatenate((mat1, mat2), axis=1)
print("\nHorizontal stack:", horizontal_stack, sep="\n")

## 8. Linear Algebra with NumPy

Essential operations for machine learning algorithms.

In [None]:
# Basic vector algebra
a = np.array([2, 4, 6])
b = np.array([1, 3, 5])

print(f"Vector a: {a}", f"Vector b: {b}", sep="\n")

# Dot product: sum of the element-wise products (used all over ML)
dot_product = a.dot(b)
print(f"Dot product: {dot_product}")

# Euclidean length of each vector
magnitude_a, magnitude_b = np.linalg.norm(a), np.linalg.norm(b)
print(f"Magnitude of a: {magnitude_a:.2f}")
print(f"Magnitude of b: {magnitude_b:.2f}")

# Dividing a vector by its own length gives a unit vector (length 1)
unit_vector_a = a / magnitude_a
print(f"Unit vector a: {unit_vector_a}")
print(f"Magnitude of unit vector: {np.linalg.norm(unit_vector_a):.2f}")

In [None]:
# Matrix product and transpose
A = np.array([[1, 2], [3, 4], [5, 6]])
B = np.array([[7, 8], [9, 10]])

print("Matrix A (3x2):", A, sep="\n")
print("\nMatrix B (2x2):", B, sep="\n")

# '@' is the matrix product (same as np.dot for 2D arrays); it is NOT the
# element-wise '*'.  A (3x2) times B (2x2) gives a 3x2 result.
matrix_mult = A @ B
print("\nMatrix multiplication A @ B:", matrix_mult, sep="\n")

# Transposing swaps rows and columns
A_transpose = np.transpose(A)
print("\nTranspose of A:", A_transpose, sep="\n")
print(f"Shape changed from {A.shape} to {A_transpose.shape}")

In [None]:
# Example: Simple linear regression setup
# This demonstrates how linear algebra is used in ML

# Generate sample data
np.random.seed(42)  # For reproducible results
n_samples, n_features = 100, 3

# Feature matrix X (each row is a data point)
X = np.random.randn(n_samples, n_features)

# True weights (what we want to learn)
true_weights = np.array([1.5, -2.0, 0.5])

# Generate target values with some noise
noise = np.random.randn(n_samples) * 0.1
y = X @ true_weights + noise  # @ is matrix multiplication

print(f"Data shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"True weights: {true_weights}")

# Add bias term (intercept): a leading column of ones
X_with_bias = np.column_stack([np.ones(n_samples), X])
print(f"X with bias shape: {X_with_bias.shape}")

# Analytical solution: w = (X^T X)^(-1) X^T y
# Rather than forming the inverse explicitly, solve the equivalent linear
# system (X^T X) w = X^T y.  This gives the same w with less work and better
# numerical accuracy than multiplying by an explicit inverse.
gram_matrix = X_with_bias.T @ X_with_bias
estimated_weights = np.linalg.solve(gram_matrix, X_with_bias.T @ y)

print(f"\nEstimated weights (with bias): {estimated_weights}")
print(f"True weights (with bias=0): [0, {true_weights[0]}, {true_weights[1]}, {true_weights[2]}]")
# Skip the bias entry (index 0) when comparing against the true weights
print(f"Error: {np.abs(estimated_weights[1:] - true_weights)}")

## 9. Practical Examples for Machine Learning

Common data preprocessing tasks using NumPy.

In [None]:
# Feature scaling - a standard preprocessing step
# Build a 50x3 sample whose columns have very different scales and offsets
np.random.seed(42)
data = np.random.randn(50, 3) * np.array([10, 100, 0.1]) + np.array([50, 500, 5])

# Per-column statistics, computed once and reused below
feature_mean = data.mean(axis=0)
feature_std = data.std(axis=0)
feature_min = data.min(axis=0)
feature_max = data.max(axis=0)

print("Original data statistics:")
print(f"Mean: {feature_mean}")
print(f"Std: {feature_std}")
print(f"Min: {feature_min}")
print(f"Max: {feature_max}")

# Z-score: subtract the column mean, divide by the column standard deviation
# (each column ends up with mean 0 and standard deviation 1)
data_zscore = (data - feature_mean) / feature_std

print("\nAfter Z-score normalization:")
print(f"Mean: {data_zscore.mean(axis=0)}")
print(f"Std: {data_zscore.std(axis=0)}")

# Min-Max: shift by the column minimum, divide by the column range -> [0, 1]
data_minmax = (data - feature_min) / (feature_max - feature_min)

print("\nAfter Min-Max normalization:")
print(f"Min: {data_minmax.min(axis=0)}")
print(f"Max: {data_minmax.max(axis=0)}")

In [None]:
# Train-test split implementation
def train_test_split_numpy(X, y, test_size=0.2, random_state=None):
    """Randomly split ``X`` and ``y`` into train and test subsets.

    Parameters
    ----------
    X, y
        Objects that support ``len()`` (``X`` only) and indexing with a NumPy
        integer index array, e.g. NumPy arrays.  The same shuffled indices
        are applied to both, so row ``k`` of ``X`` stays paired with element
        ``k`` of ``y``.  ``y`` is assumed to be as long as ``X``; this is not
        checked.
    test_size : float
        Fraction of the samples placed in the test set.  The test count is
        ``int(len(X) * test_size)``, i.e. truncated toward zero.
    random_state : int or None
        When not ``None``, NumPy's global random generator is seeded with
        this value before shuffling, making the split reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Compare against None explicitly: a truthiness test would silently skip
    # seeding for the perfectly valid seed 0.
    if random_state is not None:
        np.random.seed(random_state)

    n_samples = len(X)
    n_test = int(n_samples * test_size)

    # Shuffle the sample indices once; the first n_test go to the test set
    # and the remainder to the training set, so the two sets never overlap.
    indices = np.random.permutation(n_samples)
    test_idx = indices[:n_test]
    train_idx = indices[n_test:]

    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

# Try the split on synthetic data: 100 samples, 3 features, binary labels
X = np.random.randn(100, 3)
y = np.random.randint(0, 2, size=100)

# Hold out 20% of the samples; the fixed seed makes the split repeatable
split_parts = train_test_split_numpy(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(
    f"Original data: {len(X)} samples",
    f"Training set: {len(X_train)} samples",
    f"Test set: {len(X_test)} samples",
    f"Test ratio: {len(X_test) / len(X):.1%}",
    sep="\n",
)

In [None]:
# Computing basic statistics for analysis
# Generate sample dataset: 200 samples (rows) x 4 features (columns)
np.random.seed(42)
dataset = np.random.randn(200, 4)

print("Dataset shape:", dataset.shape)
print("\nBasic statistics:")
print(f"Mean of each feature: {dataset.mean(axis=0)}")
print(f"Standard deviation: {dataset.std(axis=0)}")
print(f"Variance: {dataset.var(axis=0)}")
print(f"Minimum values: {dataset.min(axis=0)}")
print(f"Maximum values: {dataset.max(axis=0)}")

# Correlation matrix between features.  np.corrcoef treats each ROW as one
# variable, so the dataset is transposed to put the features in rows.
correlation_matrix = np.corrcoef(dataset.T)
print("\nCorrelation matrix:")
print(correlation_matrix)

# Find highly correlated features (|correlation| > 0.5).
# The correlation matrix is symmetric and its diagonal compares each feature
# with itself, so only the strict upper triangle (k=1) is kept: every pair
# is then reported once instead of twice as (i, j) and (j, i).
high_corr_mask = np.triu(np.abs(correlation_matrix) > 0.5, k=1)

high_corr_pairs = np.where(high_corr_mask)
if len(high_corr_pairs[0]) > 0:
    print("\nHighly correlated feature pairs:")
    for i, j in zip(high_corr_pairs[0], high_corr_pairs[1]):
        print(f"Features {i} and {j}: correlation = {correlation_matrix[i, j]:.3f}")
else:
    print("\nNo highly correlated features found.")

## 10. Summary and Next Steps

You now have the essential Python and NumPy skills needed for machine learning!

In [None]:
# Recap of the notebook, printed section by section
print("Python and NumPy Basics - Summary:")

# Each entry pairs a section heading with the bullet lines printed under it
summary_sections = (
    (
        "\n1. Python fundamentals:",
        (
            " - Variables and data types",
            " - Lists, tuples, dictionaries",
            " - Control flow and list comprehensions",
        ),
    ),
    (
        "\n2. NumPy essentials:",
        (
            " - Array creation and properties",
            " - Vectorized operations",
            " - Indexing and slicing",
            " - Broadcasting",
            " - Linear algebra operations",
        ),
    ),
    (
        "\n3. ML preprocessing:",
        (
            " - Data normalization",
            " - Train-test splitting",
            " - Statistical analysis",
        ),
    ),
)
for section_heading, section_bullets in summary_sections:
    print(section_heading)
    for bullet_line in section_bullets:
        print(bullet_line)

print("\nYou're ready for machine learning algorithms!")

In [None]:
# Test your understanding - try these exercises:

# Exercise 1: Create a 5x5 matrix of random numbers and find:
# - The sum of each row
# - The maximum value in each column
# - All values greater than 0.5

print("Exercise 1:")
matrix = np.random.random((5, 5))
print("Random 5x5 matrix:")
print(matrix)
print(f"Sum of each row: {matrix.sum(axis=1)}")       # axis=1 collapses the columns
print(f"Max of each column: {matrix.max(axis=0)}")    # axis=0 collapses the rows
# Boolean indexing returns the matching values themselves (as a 1D array),
# which is what the exercise asks for; the count is shown as well.
values_above_half = matrix[matrix > 0.5]
print(f"Values > 0.5: {values_above_half}")
print(f"Number of values > 0.5: {np.sum(matrix > 0.5)}")

# Exercise 2: Normalize a dataset and verify the result
print("\nExercise 2:")
# Three columns with different scales ([5, 10, 2]) and offsets ([10, 50, 5])
data = np.random.randn(100, 3) * [5, 10, 2] + [10, 50, 5]
# Z-score per column: subtract the column mean, divide by the column std
normalized = (data - data.mean(axis=0)) / data.std(axis=0)
print(f"Original mean: {data.mean(axis=0)}")
# Verification: the mean should be ~0 and the std ~1 for every column
print(f"Normalized mean: {normalized.mean(axis=0)}")
print(f"Normalized std: {normalized.std(axis=0)}")