Keyboard shortcuts

Press ← or → to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

Polynomial Regression

  • Polynomial regression is when we allow the model to be a function like a + bx + cx^2 + dx^3 + ... (given a single feature x)
polynomial_regression.ipynb
#get_ipython().system('pip install numpy pandas scikit-learn matplotlib joblib')

import sys
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from joblib import dump
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


def generate_data_with_noise(size, noise_level):
    """Build a synthetic quadratic data set with uniform noise.

    The noise is drawn from NumPy's global random state, so seed it with
    ``np.random.seed`` beforehand if reproducible output is needed.

    Args:
        size: Number of samples; ``x`` takes the integer values
            ``0, 1, ..., size - 1``.
        noise_level: Total width of the noise band. Each noise term lies in
            ``[-noise_level / 2, noise_level / 2)``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``x`` and ``y`` where
        ``y = x * x + noise``.
    """
    x = np.arange(size)
    # rand() samples [0, 1); subtracting 0.5 centres the noise on zero.
    noise = noise_level * (np.random.rand(size) - 0.5)
    y = x * x + noise
    # Build the frame once, with named columns (the earlier transposed
    # frame was discarded immediately and has been dropped).
    return pd.DataFrame({"x": x, "y": y})

def main():
    """Fit a regression model on synthetic quadratic data and report R^2.

    Expects exactly three command-line arguments:

    * ``SIZE``  - number of samples to generate (integer)
    * ``NOISE`` - noise level passed to ``generate_data_with_noise`` (integer)
    * ``MODE``  - ``"P"`` for a degree-3 polynomial pipeline, ``"L"`` for a
      plain linear regression

    Prints the generated data, the sizes of the train/test split and the
    coefficient of determination on both splits.

    Raises:
        SystemExit: if the argument count is wrong or ``MODE`` is unknown.
        ValueError: if ``SIZE`` or ``NOISE`` is not an integer literal.
    """
    # sys.exit is used instead of the builtin exit(): the latter is injected
    # by the site module for interactive use and is not guaranteed to exist.
    if len(sys.argv) != 4:
        sys.exit(f"Usage: {sys.argv[0]} SIZE NOISE MODE")
    size, noise, mode = int(sys.argv[1]), int(sys.argv[2]), sys.argv[3]

    # Pick the model first so that an invalid MODE fails before any data is
    # generated or printed.
    if mode == "P":
        model = Pipeline([
            ('poly_features', PolynomialFeatures(degree=3)),
            ('linear_regression', LinearRegression())
        ])
    elif mode == "L":
        model = LinearRegression()
    else:
        sys.exit(f"Invalid mode {mode}")

    # Fixed seed so that every run produces the same noise.
    np.random.seed(42)
    df = generate_data_with_noise(size, noise)
    print(df)
    #df.plot()
    #df.plot.scatter(x='x', y='y', c='Blue');
    #plt.show()

    # Double brackets keep X two-dimensional (a DataFrame); y is a Series.
    X = df[["x"]]
    y = df["y"]
    print(y.head(3))

    # Fixed random_state makes the train/test partition repeatable.
    x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=4)
    print(len(y_train), len(y_test))

    model.fit(x_train, y_train)
    #print(f"intercept: {model.intercept_}  coef: {model.coef_}")
    print('train coefficient of determination:', model.score(x_train, y_train))
    print('test coefficient of determination:', model.score(x_test, y_test))

    # Optional: draw the fitted line over the data and persist the model.
    #x1, x2 = min(df["x"]), max(df["x"]) # 0, size-1
    #y1, y2 = model.predict(pd.DataFrame({'x': [x1, x2]}))
    #plt.plot([x1, x2], [y1, y2], color="red");
    #plt.scatter(df["x"], df["y"]);
    #plt.show()
    #dump(model, 'linear.joblib')


# Run the demo only when executed as a script, so importing this module
# (e.g. to reuse generate_data_with_noise) does not parse sys.argv or exit.
if __name__ == "__main__":
    main()