ft_linear_regression

2020-11-22 18:05:40 +01:00 · 2020-11-22 18:05:40 +01:00 · 6898b2c39b
parent d5134e3903
commit 6898b2c39b
6 changed files with 299 additions and 0 deletions
--- a/LinearRegression.py
+++ b/LinearRegression.py
@ -0,0 +1,83 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+class LinearRegression:
+    """Linear regression trained by batch gradient descent.
+
+    Every column of ``data`` except the last is a feature; the last column
+    is the target. Training runs on min-max scaled features with a leading
+    column of ones (so ``thetas[0]`` acts as the intercept), and the fitted
+    coefficients are converted back to raw feature units afterwards.
+
+    Attributes:
+        cost: mean squared error recorded after each epoch.
+        epochs: number of gradient-descent iterations.
+        learning_rate: step size applied to the averaged gradient.
+        raw_data: the unscaled input array.
+        data: design matrix (ones column followed by the scaled features).
+        data_y: target column, unscaled.
+        thetas: coefficients in scaled feature space, intercept first.
+        raw_thetas: coefficients in raw feature units, intercept first.
+    """
+
+    def __init__(self, thetas, data, epochs=1000, learning_rate=0.001):
+        """Scale the data and initialise the coefficients.
+
+        Args:
+            thetas: starting coefficients. The last element is taken as the
+                scaled-space intercept and elements ``1..n-1`` as raw-unit
+                slopes (``n`` being the number of columns in ``data``).
+                Anything that cannot be read that way is replaced by zeros.
+            data: 2-D NumPy array of features with the target in the last
+                column.
+            epochs: number of gradient-descent iterations.
+            learning_rate: gradient-descent step size.
+        """
+        self.cost = []
+        self.epochs = epochs
+        self.learning_rate = learning_rate
+        self.raw_thetas = thetas
+        self.raw_data = data
+        self.__get_scaled_data()
+        try:
+            self.__get_scaled_thetas()
+        except (TypeError, ValueError, IndexError):
+            # Best-effort start: malformed thetas fall back to zeros, but
+            # KeyboardInterrupt/SystemExit are no longer swallowed.
+            print('error in raw_thetas format, setting thetas to 0')
+            self.thetas = np.zeros(self.raw_data.shape[1])
+            self.raw_thetas = np.zeros(self.raw_data.shape[1])
+
+    def gradient_descent(self):
+        """Run ``epochs`` descent steps, then refresh ``raw_thetas``.
+
+        The cost after each step is appended to ``self.cost``.
+        """
+        for _ in range(self.epochs):
+            self.thetas = self.__gradient_descent_epoch()
+            self.cost.append(self.get_cost())
+        mins, ranges = self.__feature_bounds()
+        raw_thetas = np.empty(len(self.thetas))
+        raw_thetas[1:] = self.thetas[1:] / ranges
+        # Exact inverse of x' = (x - min) / range: the intercept absorbs the
+        # shift by each feature's minimum. This holds at any point of the
+        # training, not only once the descent has converged.
+        raw_thetas[0] = self.thetas[0] - np.dot(raw_thetas[1:], mins)
+        self.raw_thetas = raw_thetas
+
+    def get_cost(self):
+        """Return the mean squared error of the current thetas over all rows."""
+        errors = self.data.dot(self.thetas) - self.data_y
+        return np.mean(errors ** 2)
+
+    def show(self):
+        """Plot the samples with the fitted line, next to the cost history.
+
+        Only the first two columns of ``raw_data`` and the first two entries
+        of ``raw_thetas`` are drawn; ends with a call to ``plt.show()``.
+        """
+        x_values = self.raw_data[:, 0]
+        x_max = max(x_values)
+        plt.subplot(1, 2, 1)
+        plt.plot(x_values, self.raw_data[:, 1], 'r.')
+        plt.plot([0, x_max], [self.raw_thetas[0], self.raw_thetas[0] + self.raw_thetas[1] * x_max])
+        plt.ylabel('y')
+        plt.xlabel('x')
+        plt.subplot(1, 2, 2)
+        plt.plot(self.cost)
+        plt.ylabel('cost')
+        plt.xlabel('epochs')
+        plt.tight_layout()
+        plt.show()
+
+    def __feature_bounds(self):
+        """Return ``(mins, ranges)`` of the feature columns of ``raw_data``.
+
+        A constant column has a range of zero; it is reported as 1 so that
+        scaling maps the column to zeros instead of dividing by zero.
+        """
+        features = self.raw_data[:, :-1]
+        mins = features.min(axis=0)
+        ranges = features.max(axis=0) - mins
+        return mins, np.where(ranges == 0, 1.0, ranges)
+
+    def __get_scaled_data(self):
+        """Build ``data`` and ``data_y`` from ``raw_data``.
+
+        ``data`` gets a first column filled with 1 (so theta0 * x0 = theta0)
+        followed by the min-max scaled features; ``data_y`` is the last
+        column of ``raw_data``.
+        """
+        mins, ranges = self.__feature_bounds()
+        self.data = np.empty(shape=self.raw_data.shape)
+        self.data[:, 0] = 1
+        self.data[:, 1:] = (self.raw_data[:, :-1] - mins) / ranges
+        self.data_y = self.raw_data[:, -1]
+
+    def __get_scaled_thetas(self):
+        """Convert ``raw_thetas`` into scaled-space ``thetas``.
+
+        Raises:
+            TypeError, ValueError: if ``raw_thetas`` is not a 1-D numeric
+                sequence with at least as many values as ``raw_data`` has
+                columns.
+        """
+        n_coeffs = self.raw_data.shape[1]
+        raw = np.asarray(self.raw_thetas, dtype=float)
+        if raw.ndim != 1 or raw.shape[0] < n_coeffs:
+            raise ValueError('raw_thetas must hold at least %d values' % n_coeffs)
+        _, ranges = self.__feature_bounds()
+        thetas = np.empty(n_coeffs)
+        # The scaled intercept is stored as the last value of the input.
+        thetas[0] = raw[-1]
+        thetas[1:] = raw[1:n_coeffs] * ranges
+        self.thetas = thetas
+
+    def __gradient_descent_epoch(self):
+        """Return the thetas obtained after one batch gradient step."""
+        errors = self.data.dot(self.thetas) - self.data_y
+        gradient_sum = self.data.T.dot(errors)
+        return self.thetas - self.learning_rate / float(len(self.data)) * gradient_sum
+
+    def __predict(self, row):
+        """Return the model output for row index ``row`` of ``data``."""
+        return self.data[row].dot(self.thetas)
--- a/data.csv
+++ b/data.csv
@ -0,0 +1,25 @@
+km,price
+240000,3650
+139800,3800
+150500,4400
+185530,4450
+176000,5250
+114800,5350
+166800,5800
+89000,5990
+144500,5999
+84000,6200
+82029,6390
+63060,6390
+74000,6600
+97500,6800
+67000,6800
+76025,6900
+48235,6900
+93000,6990
+60949,7490
+65674,7555
+54000,7990
+68500,7990
+22899,7990
+61789,8290
--- a/data_offset.csv
+++ b/data_offset.csv
@ -0,0 +1,25 @@
+km,price
+240000,8650
+139800,8800
+150500,9400
+185530,9450
+176000,10250
+114800,10350
+166800,10800
+89000,10990
+144500,10999
+84000,11200
+82029,11390
+63060,11390
+74000,11600
+97500,11800
+67000,11800
+76025,11900
+48235,11900
+93000,11990
+60949,12490
+65674,12555
+54000,12990
+68500,12990
+22899,12990
+61789,13290
--- a/multi_data.csv
+++ b/multi_data.csv
@ -0,0 +1,25 @@
+age,km,price
+10,240000,8050
+15,139800,8200
+18,150500,7400
+2,185530,9250
+6,176000,9950
+8,114800,9950
+10,166800,9800
+25,89000,7990
+4,144500,10499
+1,84000,11000
+0,82029,12000
+2,63060,11300
+7,74000,11000
+4,97500,11000
+9,67000,10000
+4,76025,11200
+4,48235,11300
+7,93000,10500
+1,60949,12300
+6,65674,11800
+5,54000,12500
+4,68500,12600
+2,22899,12800
+7,61789,12590
--- a/predict.py
+++ b/predict.py
@ -0,0 +1,86 @@
+import argparse
+import numpy as np
+from train import train
+from LinearRegression import LinearRegression
+from matplotlib import animation
+
+
+def get_thetas(path):
+    """Load the first two thetas stored in a CSV file.
+
+    Args:
+        path: file read with ``np.genfromtxt`` (comma-delimited).
+
+    Returns:
+        ``[thetas[0], thetas[1]]`` on success, or the string ``"error"``
+        when the file cannot be loaded or holds fewer than two values.
+    """
+    try:
+        thetas = np.genfromtxt(path, delimiter=',')
+    except (OSError, ValueError):
+        # Missing/unreadable file or content genfromtxt cannot parse.
+        print ('info: path not found, asking for train')
+        return "error"
+    try:
+        return [thetas[0], thetas[1]]
+    except IndexError:
+        # A single-value file loads as a 0-d array; indexing it fails here.
+        print ('warning: path in wrong format, asking for train')
+        return "error"
+
+def get_price(mileage, thetas):
+    """Return the price estimated for ``mileage``.
+
+    ``thetas[0]`` is used as the intercept and ``thetas[1]`` as the slope;
+    any further entries are ignored.
+    """
+    intercept = thetas[0]
+    slope = thetas[1]
+    return intercept + mileage * slope
+
+def predict_subject():
+    """Interactively predict a car price from a mileage typed by the user.
+
+    Thetas are read from ``thetas.csv``. When that fails the user is asked
+    whether to train on ``data.csv`` first; any answer other than ``y``, or
+    a failure during training, leaves both thetas at 0. The mileage is then
+    read from standard input and the estimate is printed.
+    """
+    thetas = get_thetas('thetas.csv')
+    if thetas == "error":
+        print("")
+        try:
+            input_thetas = input("model isn't trained, would you like to train it before predicting car price ? y/n\n")
+            if input_thetas == 'y':
+                raw_data = np.genfromtxt('data.csv', delimiter=',', skip_header=1)
+                train(raw_data, np.zeros(2), 'thetas.csv', False)
+                thetas = np.genfromtxt('thetas.csv', delimiter=',')
+            else:
+                print ('info: wrong input format, setting thetas to 0')
+                thetas = [0, 0]
+        except Exception:
+            # Best-effort fallback; Ctrl-C still interrupts the program.
+            print ('info: wrong input format or fail to train, setting thetas to 0')
+            thetas = [0, 0]
+    try:
+        # NOTE(security): eval() executes whatever expression is typed at
+        # the prompt. Kept so arithmetic input keeps working; switch to
+        # int(float(...)) if this is ever fed untrusted input.
+        mileage = int(eval(input("Enter mileage\n")))
+    except Exception:
+        print ('info: input a number')
+        return
+    if mileage < 0:
+        print ('info: mileage should be superior to 0 ! aborting')
+        return
+    price = get_price(mileage, thetas)
+    if price < 0:
+        print('This car belongs in a museum ! (price inferior to 0)')
+    else:
+        print('Predicted car value is ', price)
+
+def get_y(x, thetas):
+    """Return the sum of ``x[i] * thetas[i]`` over every index of ``x``."""
+    return sum(value * thetas[index] for index, value in enumerate(x))
+
+def predict(thetas_path):
+    """Prompt for feature values and print the output of a saved model.
+
+    The input vector has one entry fewer than the thetas file: entry 0 is
+    fixed to 1 and the remaining entries are read from standard input, so
+    the last value stored in the file does not take part in the result.
+
+    Args:
+        thetas_path: CSV file read with ``np.genfromtxt``.
+    """
+    try:
+        thetas = np.genfromtxt(thetas_path, delimiter=',')
+    except (OSError, ValueError):
+        print('wrong name or format')
+        return
+    # A file with fewer than two values leaves no slot for x[0]; reject it
+    # here instead of failing on len() or on the assignment below.
+    if thetas.ndim != 1 or len(thetas) < 2:
+        print('wrong name or format')
+        return
+    x = np.empty(len(thetas) - 1)
+    x[0] = 1
+    for i in range(1, len(thetas) - 1):
+        try:
+            s = "Enter feature " + str(i) + "\n"
+            # NOTE(security): eval() executes whatever expression is typed
+            # at the prompt; do not feed this untrusted input.
+            x[i] = int(eval(input(s)))
+        except Exception:
+            print('input a number')
+            return
+    print("y is equal to ", get_y(x, thetas))
+
+def main():
+    """Parse the command line and run the matching prediction mode.
+
+    With ``-p/--path`` the thetas file at that path is handed to
+    ``predict``; without it the interactive ``predict_subject`` runs.
+    """
+    cli = argparse.ArgumentParser(description='DSLR is a 2 day project, if you do Linear Regression in two weeks :pepethefrog:')
+    cli.add_argument("-p", "--path", type=str, default=False, help="thetas file path")
+    options = cli.parse_args()
+    if options.path is not False:
+        predict(options.path)
+        return
+    predict_subject()
+
+# Run the command-line interface only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/train.py
+++ b/train.py
@ -0,0 +1,55 @@
+import argparse
+import numpy as np
+from LinearRegression import LinearRegression
+from matplotlib import animation
+
+def get_thetas():
+    """Load two starting thetas from ``thetas.csv``.
+
+    Returns:
+        ``[thetas[2], thetas[1]]`` read from the file, or ``[0, 0]`` when
+        the file cannot be loaded or holds fewer than three values.
+    """
+    try:
+        thetas = np.genfromtxt('thetas.csv', delimiter=',')
+    except (OSError, ValueError):
+        # Missing/unreadable file or content genfromtxt cannot parse.
+        print ("info: thetas.csv not found, setting thetas to 0")
+        return [0, 0]
+    try:
+        return [thetas[2], thetas[1]]
+    except IndexError:
+        print ("warning: thetas.csv in wrong format, setting thetas to 0")
+        return [0, 0]
+
+def train(raw_data, raw_thetas, thetas_path, visu):
+    """Fit a LinearRegression on ``raw_data`` and save its thetas.
+
+    Args:
+        raw_data: array handed to ``LinearRegression`` as ``data``.
+        raw_thetas: starting coefficients handed to ``LinearRegression``.
+        thetas_path: destination written with ``np.savetxt``; receives the
+            model's ``raw_thetas`` followed by its ``thetas[0]``.
+        visu: when truthy and the model has exactly two thetas, the
+            model's ``show()`` plot is displayed after saving.
+    """
+    model = LinearRegression(thetas=raw_thetas, data=raw_data, epochs=10000, learning_rate=0.1)
+    print(model.raw_data)
+    print("thetas before train")
+    print(model.raw_thetas[:-1])
+    model.gradient_descent()
+    print("thetas after train")
+    print(model.raw_thetas)
+    # The model's thetas[0] is stored last, after the raw-unit thetas.
+    saved = [*model.raw_thetas, model.thetas[0]]
+    np.savetxt(thetas_path, saved, delimiter=',')
+    if not visu or len(model.thetas) != 2:
+        return
+    model.show()
+
+def main():
+    """Parse the command line, load the inputs and train the model.
+
+    ``-p/--path`` selects the data CSV (its first line is skipped as a
+    header), ``-t/--thetas`` the file the starting thetas are read from and
+    the trained thetas are written back to, and ``--visu`` asks for a plot
+    after training.
+    """
+    parser = argparse.ArgumentParser(description='DSLR is a 2 day project, if you do Linear Regression in two weeks :pepethefrog:')
+    parser.add_argument("-p", "--path", type=str, default='data.csv', help="data file path")
+    parser.add_argument("-t", "--thetas", type=str, default='thetas.csv', help="thetas file path")
+    parser.add_argument("--visu", default=False, help="plot data on graph", action="store_true")
+    args = parser.parse_args()
+    try:
+        raw_data = np.genfromtxt(args.path, delimiter=',', skip_header=1)
+    except (OSError, ValueError):
+        print('csv file not found or wrong')
+        return
+    # The model indexes raw_data by column, so a file that loads as anything
+    # other than a 2-D table (single row, single column, empty) is unusable.
+    if raw_data.ndim != 2:
+        print('csv file not found or wrong')
+        return
+    try:
+        raw_thetas = np.genfromtxt(args.thetas, delimiter=',')
+    except (OSError, ValueError):
+        print('thetas file not found or wrong, setting thetas to 0')
+        raw_thetas = np.zeros(raw_data.shape[1])
+    train(raw_data, raw_thetas, args.thetas, args.visu)
+
+# Run the trainer only when executed as a script; predict.py imports
+# train() from this module and must not trigger a training run on import.
+if __name__ == '__main__':
+    main()