Keyboard shortcuts

Press ← or → to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

Linear regression with sklearn

In this example we use some generated data to give a basic feel for how linear regression works.

In the first Jupyter Notebook file we see how we can train a model:

  • examples/ml/basic_linear_regression.ipynb

Then we have two files, one a Jupyter notebook and one a plain Python file, demonstrating how we can use the model.

  • examples/ml/use_basic_linear_expression.ipynb
from joblib import load
import sys

# Command-line predictor: every argument is one x value to feed to the
# previously saved model.
if len(sys.argv) < 2:
    # sys.exit() instead of the site-provided exit(), which is intended for
    # the interactive shell and is not guaranteed to exist in every runtime.
    sys.exit(f"Usage: {sys.argv[0]} Xes")

# predict() receives one single-feature row per value, hence the inner list.
input_values = [[float(val)] for val in sys.argv[1:]]

# The model file is looked up relative to the current working directory.
model = load('linear.joblib')
print(model.predict(input_values))

examples/ml/basic_linear_regression.py

#get_ipython().system('pip install numpy pandas scikit-learn matplotlib joblib')

import sys
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from joblib import dump
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

def generate_data_with_noise(size: int, noise_level: float) -> pd.DataFrame:
    """Return a DataFrame of points scattered around the line y = x.

    Args:
        size: Number of points; x takes the values 0, 1, ..., size - 1.
        noise_level: Width of the noise band. Each y is x plus a value drawn
            uniformly from [-noise_level / 2, noise_level / 2).

    Returns:
        A DataFrame with the columns "x" and "y".

    The noise comes from NumPy's global random state, so call
    ``np.random.seed`` beforehand for reproducible data.
    """
    x = np.arange(size)
    # rand() yields values in [0, 1); shifting by 0.5 centres the noise on zero.
    noise = noise_level * (np.random.rand(size) - 0.5)
    y = x + noise
    return pd.DataFrame({"x": x, "y": y})

def main():
    """Generate noisy linear data, fit a linear regression, and plot the fit.

    Expects exactly two command-line arguments, SIZE and NOISE, both parsed
    as integers; otherwise exits with a usage message. Prints the first
    target values, the train/test split sizes, the fitted intercept and
    coefficient, and the model score on the train, test, and full data,
    then shows a scatter plot of the data with the fitted line in red.
    """
    if len(sys.argv) != 3:
        # sys.exit() instead of the site-provided exit(), which is intended
        # for the interactive shell.
        sys.exit(f"Usage: {sys.argv[0]} SIZE NOISE")
    size, noise = int(sys.argv[1]), int(sys.argv[2])

    # Fixed seed so the generated noise is identical on every run.
    np.random.seed(42)
    df = generate_data_with_noise(size, noise)

    # Double brackets keep X a one-column DataFrame (the feature matrix),
    # while y is a plain Series.
    X = df[["x"]]
    y = df["y"]
    print(y.head(3))

    x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=4)
    print(len(y_train), len(y_test))

    model = LinearRegression()
    model.fit(x_train, y_train)
    print(f"intercept: {model.intercept_}  coef: {model.coef_}")
    print('train coefficient of determination:', model.score(x_train, y_train))
    print('test coefficient of determination:', model.score(x_test, y_test))
    print('coefficient of determination:', model.score(X, y))

    # Two predictions at the extremes of x (0 and size - 1) are enough to
    # draw the fitted straight line.
    x1, x2 = df["x"].min(), df["x"].max()
    y1, y2 = model.predict(pd.DataFrame({'x': [x1, x2]}))
    plt.plot([x1, x2], [y1, y2], color="red")
    plt.scatter(df["x"], df["y"])
    plt.show()

    # Optional: uncomment to save the fitted model to disk.
    #dump(model, 'linear.joblib')

# Run only when executed as a script, so that importing this module does not
# start a training run and open a plot window.
if __name__ == "__main__":
    main()