ft_linear_regression

2020-11-22 18:05:40 +01:00 · 2020-11-22 18:05:40 +01:00 · 6898b2c39b
parent d5134e3903
commit 6898b2c39b
6 changed files with 299 additions and 0 deletions
--- a/LinearRegression.py
+++ b/LinearRegression.py
@ -0,0 +1,83 @@
 import numpy as np
 import matplotlib.pyplot as plt
 class LinearRegression:
    """Linear regression fitted by batch gradient descent on min-max scaled features.

    The last column of ``data`` is the target; every other column is a
    feature. Training runs on the scaled features (``self.thetas``) and the
    coefficients are converted back to raw feature units
    (``self.raw_thetas``) at the end of ``gradient_descent``.
    """

    def __init__(self, thetas, data, epochs=1000, learning_rate=0.001):
        """Store the hyper-parameters and build the scaled data and thetas.

        thetas: initial raw coefficients. Entry ``i >= 1`` is the raw slope of
            feature ``i - 1``; the LAST entry is read as the already-scaled
            intercept. Replaced by zeros when the sequence is unusable.
        data: 2-D numpy array, feature columns first, target in the last column.
        epochs: number of gradient descent iterations.
        learning_rate: step size applied to the averaged gradient.
        """
        self.cost = []
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.raw_thetas = thetas
        self.raw_data = data
        self.__get_scaled_data()
        try:
            self.__get_scaled_thetas()
        except (IndexError, KeyError, TypeError, ValueError):
            # Only malformed-input errors trigger the fallback, so that
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            print('error in raw_thetas format, setting thetas to 0')
            self.thetas = np.zeros(self.raw_data.shape[1])
            self.raw_thetas = np.zeros(self.raw_data.shape[1])

    def gradient_descent(self):
        """Run ``self.epochs`` descent steps, then refresh ``self.raw_thetas``.

        The cost after each step is appended to ``self.cost``.
        """
        for _ in range(self.epochs):
            self.thetas = self.__gradient_descent_epoch()
            self.cost.append(self.get_cost())
        minimums, spans = self.__feature_min_span()
        self.raw_thetas = np.empty(len(self.thetas))
        self.raw_thetas[1:] = self.thetas[1:] / spans
        # Exact inverse of the scaling x' = (x - min) / span:
        #   t0 + sum(t_i * x_i') == (t0 - sum(raw_t_i * min_i)) + sum(raw_t_i * x_i)
        # so the raw coefficients describe the trained model even when the
        # descent has not fully converged.
        self.raw_thetas[0] = self.thetas[0] - np.dot(self.raw_thetas[1:], minimums)

    def get_cost(self):
        """Return the mean squared error of the current thetas over ALL rows."""
        residuals = np.dot(self.data, self.thetas) - self.data_y
        return np.mean(residuals ** 2)

    def show(self):
        """Plot raw column 0 against raw column 1 with the fitted line, and the cost history."""
        xs = self.raw_data[:, 0]
        x_max = max(xs)
        plt.subplot(1, 2, 1)
        plt.plot(xs, self.raw_data[:, 1], 'r.')
        # Fitted line drawn from x = 0 to the largest observed x.
        plt.plot([0, x_max], [self.raw_thetas[0], self.raw_thetas[0] + self.raw_thetas[1] * x_max])
        plt.ylabel('y')
        plt.xlabel('x')
        plt.subplot(1, 2, 2)
        plt.plot(self.cost)
        plt.ylabel('cost')
        plt.xlabel('epochs')
        plt.tight_layout()
        plt.show()

    def __feature_min_span(self):
        """Return ``(minimums, spans)`` for every raw feature column.

        A constant column has a span of 0; it is replaced by 1 so that scaling
        and unscaling never divide by zero.
        """
        features = np.asarray(self.raw_data[:, :-1], dtype=float)
        minimums = features.min(axis=0)
        spans = features.max(axis=0) - minimums
        spans[spans == 0] = 1.0
        return minimums, spans

    def __get_scaled_data(self):
        """Build ``self.data`` and ``self.data_y`` from ``self.raw_data``.

        Column 0 of ``self.data`` is filled with 1 (so theta0 * x0 == theta0);
        the remaining columns are the min-max normalized features.
        ``self.data_y`` is the untouched last column of the raw data.
        """
        minimums, spans = self.__feature_min_span()
        self.data_y = self.raw_data[:, self.raw_data.shape[1] - 1]
        self.data = np.empty(shape=(self.raw_data.shape[0], self.raw_data.shape[1]))
        self.data[:, 0] = 1
        self.data[:, 1:] = (self.raw_data[:, :-1] - minimums) / spans

    def __get_scaled_thetas(self):
        """Convert ``self.raw_thetas`` into coefficients for the scaled features.

        The scaled intercept is read from the last entry of ``raw_thetas``;
        each raw slope is multiplied by the span of its feature.
        """
        _, spans = self.__feature_min_span()
        self.thetas = np.empty(self.raw_data.shape[1])
        self.thetas[0] = self.raw_thetas[len(self.raw_thetas) - 1]
        for i, span in enumerate(spans, start=1):
            self.thetas[i] = self.raw_thetas[i] * span

    def __gradient_descent_epoch(self):
        """Return the thetas obtained after one batch gradient descent step."""
        errors = np.dot(self.data, self.thetas) - self.data_y
        gradient = np.dot(self.data.T, errors)
        return self.thetas - self.learning_rate / float(len(self.data)) * gradient

    def __predict(self, row):
        """Return the model output for the scaled sample at index ``row``."""
        return np.dot(self.thetas, self.data[row])
--- a/data.csv
+++ b/data.csv
@ -0,0 +1,25 @@
 km,price
 240000,3650
 139800,3800
 150500,4400
 185530,4450
 176000,5250
 114800,5350
 166800,5800
 89000,5990
 144500,5999
 84000,6200
 82029,6390
 63060,6390
 74000,6600
 97500,6800
 67000,6800
 76025,6900
 48235,6900
 93000,6990
 60949,7490
 65674,7555
 54000,7990
 68500,7990
 22899,7990
 61789,8290
--- a/data_offset.csv
+++ b/data_offset.csv
@ -0,0 +1,25 @@
 km,price
 240000,8650
 139800,8800
 150500,9400
 185530,9450
 176000,10250
 114800,10350
 166800,10800
 89000,10990
 144500,10999
 84000,11200
 82029,11390
 63060,11390
 74000,11600
 97500,11800
 67000,11800
 76025,11900
 48235,11900
 93000,11990
 60949,12490
 65674,12555
 54000,12990
 68500,12990
 22899,12990
 61789,13290
--- a/multi_data.csv
+++ b/multi_data.csv
@ -0,0 +1,25 @@
 age,km,price
 10,240000,8050
 15,139800,8200
 18,150500,7400
 2,185530,9250
 6,176000,9950
 8,114800,9950
 10,166800,9800
 25,89000,7990
 4,144500,10499
 1,84000,11000
 0,82029,12000
 2,63060,11300
 7,74000,11000
 4,97500,11000
 9,67000,10000
 4,76025,11200
 4,48235,11300
 7,93000,10500
 1,60949,12300
 6,65674,11800
 5,54000,12500
 4,68500,12600
 2,22899,12800
 7,61789,12590
--- a/predict.py
+++ b/predict.py
@ -0,0 +1,86 @@
 import argparse
 import numpy as np
 from train import train
 from LinearRegression import LinearRegression
 from matplotlib import animation
 def get_thetas(path):
    """Load the first two coefficients stored in the CSV file at ``path``.

    Returns ``[theta0, theta1]`` on success, or the string ``"error"`` when
    the file cannot be read or holds fewer than two values.
    """
    try:
        thetas = np.genfromtxt(path, delimiter=',')
    except (OSError, ValueError, TypeError):
        # Missing/unreadable file or unparsable content. Narrow on purpose so
        # KeyboardInterrupt and SystemExit are not swallowed.
        print ('info: path not found, asking for train')
        return "error"
    try:
        return [thetas[0], thetas[1]]
    except (IndexError, TypeError):
        # A single value (0-d array) or an empty file cannot be indexed twice.
        print ('warning: path in wrong format, asking for train')
        return "error"
 def get_price(mileage, thetas):
    """Return the price estimated for ``mileage`` by the line ``thetas[0] + thetas[1] * x``."""
    intercept = thetas[0]
    slope_term = mileage * thetas[1]
    return intercept + slope_term
 def predict_subject():
    """Interactively predict a car price from a mileage typed by the user.

    Coefficients are read from ``thetas.csv``. When they are unavailable the
    user is offered to train on ``data.csv``; any other answer, or a failure
    while training, falls back to thetas of 0. Prints the result and returns
    nothing.
    """
    thetas = get_thetas('thetas.csv')
    if thetas == "error":
        print("")
        try:
            input_thetas = input("model isn't trained, would you like to train it before predicting car price ? y/n\n")
            if input_thetas == 'y':
                raw_data = np.genfromtxt('data.csv', delimiter=',', skip_header=1)
                train(raw_data, np.zeros(2), 'thetas.csv', False)
                thetas = np.genfromtxt('thetas.csv', delimiter=',')
            else:
                print ('info: wrong input format, setting thetas to 0')
                thetas = [0, 0]
        except Exception:
            # ``Exception`` instead of a bare except: Ctrl-C at the prompt
            # (KeyboardInterrupt) now aborts instead of silently continuing.
            print ('info: wrong input format or fail to train, setting thetas to 0')
            thetas = [0, 0]
    try:
        # NOTE(review): eval() executes whatever the user types. It is kept so
        # that expressions such as "150 * 1000" stay accepted, but it should
        # be replaced by a plain numeric parser if input can be untrusted.
        mileage = int(eval(input("Enter mileage\n")))
    except Exception:
        print ('info: input a number')
        return
    if mileage < 0:
        print ('info: mileage should be superior to 0 ! aborting')
        return
    price = get_price(mileage, thetas)
    if price < 0:
        print('This car belongs in a museum ! (price inferior to 0)')
    else:
        print('Predicted car value is ', price,)
 def get_y(x, thetas):
    """Return the sum of ``x[k] * thetas[k]`` over every index of ``x``."""
    return sum(x[k] * thetas[k] for k in range(len(x)))
 def predict(thetas_path):
    """Predict ``y`` from features typed by the user, using the thetas file.

    The file at ``thetas_path`` must hold a flat list of at least two values.
    One feature is requested for each value between the first and the last
    one (the last value is not used as a feature coefficient here). Prints
    the result, or a message when the file or an input is unusable, and
    returns nothing.
    """
    try:
        thetas = np.atleast_1d(np.genfromtxt(thetas_path, delimiter=','))
    except (OSError, ValueError, TypeError):
        print('wrong name or format')
        return
    # A single value, an empty file or a multi-column table cannot be used as
    # a coefficient list; report it instead of crashing further down.
    if thetas.ndim != 1 or len(thetas) < 2:
        print('wrong name or format')
        return
    x = np.empty(len(thetas) - 1)
    x[0] = 1  # constant term multiplied by theta0
    for i in range(1, len(thetas) - 1):
        try:
            s = "Enter feature " + str(i) + "\n"
            # NOTE(review): eval() executes whatever the user types; replace
            # it by a numeric parser if input can be untrusted.
            x[i] = int(eval(input(s)))
        except Exception:
            print('input a number')
            return
    print("y is equal to ", get_y(x, thetas))
 def main():
    """Parse the command line and run the matching prediction mode.

    Without ``-p/--path`` the interactive mileage prediction is used;
    otherwise the generic prediction reads the given thetas file.
    """
    cli = argparse.ArgumentParser(description='DSLR is a 2 day project, if you do Linear Regression in two weeks :pepethefrog:')
    cli.add_argument("-p", "--path", type=str, default=False, help="thetas file path")
    options = cli.parse_args()
    # ``path`` is either a string given by the user or the ``False`` default.
    if options.path is not False:
        predict(options.path)
        return
    predict_subject()
 if __name__ == '__main__':
    main()
--- a/train.py
+++ b/train.py
@ -0,0 +1,55 @@
 import argparse
 import numpy as np
 from LinearRegression import LinearRegression
 from matplotlib import animation
 def get_thetas():
    """Load two coefficients from ``thetas.csv`` in the working directory.

    Returns ``[thetas[2], thetas[1]]`` (third and second stored values), or
    ``[0, 0]`` when the file is missing, unreadable or too short.
    """
    try:
        thetas = np.genfromtxt('thetas.csv', delimiter=',')
    except (OSError, ValueError):
        # Narrow on purpose so KeyboardInterrupt / SystemExit propagate.
        print ("info: thetas.csv not found, setting thetas to 0")
        return [0, 0]
    try:
        return [thetas[2], thetas[1]]
    except (IndexError, TypeError):
        print ("warning: thetas.csv in wrong format, setting thetas to 0")
        return [0, 0]
 def train(raw_data, raw_thetas, thetas_path, visu):
    """Fit a LinearRegression on ``raw_data`` and save its coefficients.

    raw_data: 2-D array handed to the model as ``data``.
    raw_thetas: initial coefficients handed to the model as ``thetas``.
    thetas_path: file receiving the raw thetas followed by the scaled theta0.
    visu: when true and the model has exactly two thetas, show the plots.
    """
    model = LinearRegression(thetas=raw_thetas, data=raw_data, epochs=10000, learning_rate=0.1)
    print(model.raw_data)
    print("thetas before train")
    shown = len(model.raw_thetas) - 1
    print(model.raw_thetas[:shown])
    model.gradient_descent()
    print("thetas after train")
    print(model.raw_thetas)
    # Raw coefficients first, then the scaled intercept as the final entry.
    tosave = list(model.raw_thetas)
    tosave.append(model.thetas[0])
    np.savetxt(thetas_path, tosave, delimiter=',')
    if not visu:
        return
    if len(model.thetas) == 2:
        model.show()
 def main():
    """Parse the command line, load the data and initial thetas, then train.

    Options: ``-p/--path`` data CSV (default ``data.csv``, first line skipped
    as a header), ``-t/--thetas`` thetas file (default ``thetas.csv``, used
    both as the starting point and as the output) and ``--visu`` to plot.
    Prints a message and returns without training when the data file cannot
    be used.
    """
    parser = argparse.ArgumentParser(description='DSLR is a 2 day project, if you do Linear Regression in two weeks :pepethefrog:')
    parser.add_argument("-p", "--path", type=str, default='data.csv', help="data file path")
    parser.add_argument("-t", "--thetas", type=str, default=False, help="thetas file path")
    parser.add_argument("--visu", default=False, help="plot data on graph", action="store_true")
    args = parser.parse_args()
    if args.thetas is False:
        args.thetas = 'thetas.csv'
    try:
        raw_data = np.genfromtxt(args.path, delimiter=',', skip_header=1)
    except (OSError, ValueError):
        print('csv file not found or wrong')
        return
    # Training needs a table with at least one feature column and a target
    # column; a single column, a single row or an empty file is rejected here
    # instead of failing later on ``raw_data.shape[1]``.
    if raw_data.ndim != 2 or raw_data.shape[0] == 0 or raw_data.shape[1] < 2:
        print('csv file not found or wrong')
        return
    try:
        raw_thetas = np.genfromtxt(args.thetas, delimiter=',')
    except (OSError, ValueError):
        print('thetas file not found or wrong, setting thetas to 0')
        raw_thetas = np.zeros(raw_data.shape[1])
    train(raw_data, raw_thetas, args.thetas, args.visu)
 if __name__ == '__main__':
    main()