Directions

It’s a big, big world of astronomical data, under a starry, starry night sky…

Back to the spellbook

1. Load data

library(ff)
## Loading required package: bit
## 
## Attaching package: 'bit'
## The following object is masked from 'package:base':
## 
##     xor
## Attaching package ff
## - getOption("fftempdir")=="C:/Users/byeo/AppData/Local/Temp/RtmpQLuOCO/ff"
## - getOption("ffextension")=="ff"
## - getOption("ffdrop")==TRUE
## - getOption("fffinonexit")==TRUE
## - getOption("ffpagesize")==65536
## - getOption("ffcaching")=="mmnoflush"  -- consider "ffeachflush" if your system stalls on large writes
## - getOption("ffbatchbytes")==16777216 -- consider a different value for tuning your system
## - getOption("ffmaxbytes")==536870912 -- consider a different value for tuning your system
## 
## Attaching package: 'ff'
## The following objects are masked from 'package:utils':
## 
##     write.csv, write.csv2
## The following objects are masked from 'package:base':
## 
##     is.factor, is.ordered

Load the data.

star <- read.csv.ffdf(file = "star2002-full.csv", header = TRUE)

Check.

class(star)
## [1] "ffdf"
names(star)
##  [1] "X1"                   "X1613423"             "X807"                
##  [4] "X20011015.2226039991" "X1613424"             "X4518"               
##  [7] "X0"                   "X0.1"                 "X654"                
## [10] "X1395"                "X20011204.1149509996" "X10.955403"          
## [13] "X2288071"             "X.0.28820264"         "X0.40731233"         
## [16] "X10.559091"
nrow(star)
## [1] 15857624
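
The column names above look like values from a data record, which suggests the CSV has no header row and header = TRUE swallowed the first observation. If so, a safer load (a sketch, not run here; V1 to V16 are placeholder names) would be:

star <- read.csv.ffdf(file = "star2002-full.csv", header = FALSE,
                      col.names = paste0("V", 1:16))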

2. Some analyses

2.1 Exploration

sum(star[,5])
## [1] 2.484564e+13
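
sum() here pulls the whole column into RAM first. For data too large for that, ff's chunking idiom processes rows in batches; a minimal sketch computing the mean of column 5:

total <- 0
for (idx in chunk(star)) {
  # chunk() yields row ranges sized by getOption("ffbatchbytes")
  total <- total + sum(star[idx, 5])
}
total / nrow(star)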

2.2 Visualisation

Convert the ffdf to a regular data frame. Note that this pulls all 15.8 million rows into memory.

star_df <- as.data.frame(star)
names(star_df)
##  [1] "X1"                   "X1613423"             "X807"                
##  [4] "X20011015.2226039991" "X1613424"             "X4518"               
##  [7] "X0"                   "X0.1"                 "X654"                
## [10] "X1395"                "X20011204.1149509996" "X10.955403"          
## [13] "X2288071"             "X.0.28820264"         "X0.40731233"         
## [16] "X10.559091"

It is not clear what these variables mean; as noted in Section 1, the names are likely values from the first record.

The plots below are for illustration only.

library(ggplot2)

ggplot(star_df) + aes(x = X1613424) + geom_histogram() +
  ggtitle("Histogram of X1613424")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
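
The message is only a nudge; setting bins (or binwidth) explicitly silences it, for example:

ggplot(star_df) + aes(x = X1613424) +
  geom_histogram(bins = 50) +
  ggtitle("Histogram of X1613424, 50 bins")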

ggplot(star_df) + aes(x = X1613424, y = X4518) +
  geom_point(shape = 1, colour = "blue") + ggtitle("Scatter plot")

2.3 Random sample

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
star_sample <- sample_frac(star_df, 0.0001)

Check.

nrow(star_sample)
## [1] 1586
head(star_sample)
##   X1 X1613423  X807 X20011015.2226039991 X1613424 X4518 X0 X0.1 X654 X1395
## 1  0  1656447 20050             20011116  1656448     5  0    0    0     0
## 2  0  2315951 22691             20030203  2315957   159  0    1    0     4
## 3  0   464388 67046             20010916   464386     0  0    0    0     0
## 4  0   847048  9358             20010913   847049   197  0    0    0    92
## 5  3  1751961  4224             20011121  1751962  6126  0    0  721  2156
## 6  1  1722545 25879             20011114  1722546  4846  1    0  671  1852
##   X20011204.1149509996  X10.955403 X2288071  X.0.28820264   X0.40731233
## 1             20011226   0.0000000  2320011 -9.999000e+03 -9999.0000000
## 2             20030223   0.1840816  4034033  7.632686e-01     0.2873981
## 3             20020402   0.0000000  2259010 -9.999000e+03 -9999.0000000
## 4             20020329  22.8238660  2256009 -5.209564e-02     0.3843492
## 5             20020314 302.8729200  2324037 -3.319713e-01     0.5455523
## 6             20020226 258.4452800  2318011 -2.775528e-01     0.4418349
##     X10.559091
## 1 -9999.000000
## 2  -124.471260
## 3 -9999.000000
## 4    22.776802
## 5    -8.418248
## 6   -15.677345
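
An alternative sketch that avoids materialising the full data frame first: sample row indices from the ffdf directly.

idx <- sample(nrow(star), size = round(0.0001 * nrow(star)))
star_sample_2 <- star[sort(idx), ]  # sorted indices keep the disk reads sequential
nrow(star_sample_2)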

2.4 Training validation split

Our favourite seed :-)

set.seed(666)

Create the indices for the split. This samples the row indices to split the data into training and validation sets.

train_index <- sample(1:nrow(star), 0.6 * nrow(star))
valid_index <- setdiff(1:nrow(star), train_index)

Using the indices, create the training and validation sets. This is similar in principle to splitting a data frame by row.

train_df <- star[train_index, ]
valid_df <- star[valid_index, ]

Check.

nrow(train_df)
## [1] 9514574
nrow(valid_df)
## [1] 6343050
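
A quick sanity check that the two index sets partition the rows:

length(intersect(train_index, valid_index))  # should be 0
length(train_index) + length(valid_index) == nrow(star)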

Check that the column names and types match between the two sets.

library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
compare_df_cols_same(train_df, valid_df)
## [1] TRUE
compare_df_cols(train_df, valid_df)
##             column_name train_df valid_df
## 1          X.0.28820264  numeric  numeric
## 2                    X0  integer  integer
## 3                  X0.1  integer  integer
## 4           X0.40731233  numeric  numeric
## 5                    X1  integer  integer
## 6            X10.559091  numeric  numeric
## 7            X10.955403  numeric  numeric
## 8                 X1395  integer  integer
## 9              X1613423  integer  integer
## 10             X1613424  integer  integer
## 11 X20011015.2226039991  numeric  numeric
## 12 X20011204.1149509996  numeric  numeric
## 13             X2288071  integer  integer
## 14                X4518  integer  integer
## 15                 X654  integer  integer
## 16                 X807  integer  integer

3. A model

Build the model.

Just a random model for illustration.

No idea what the variables mean :-)

regression_model <- lm(X10.559091 ~ X10.955403 + X2288071 + X.0.28820264, data = train_df)
summary(regression_model)
## 
## Call:
## lm(formula = X10.559091 ~ X10.955403 + X2288071 + X.0.28820264, 
##     data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -610.75   -9.15   -0.06    9.51  725.01 
## 
## Coefficients:
##                Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)   5.430e-01  5.103e-02     10.640   <2e-16 ***
## X10.955403   -7.295e-08  4.330e-07     -0.168    0.866    
## X2288071     -2.047e-07  1.711e-08    -11.964   <2e-16 ***
## X.0.28820264  1.000e+00  2.796e-06 357706.210   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.17 on 9514570 degrees of freedom
## Multiple R-squared:  0.9999, Adjusted R-squared:  0.9999 
## F-statistic: 4.351e+10 on 3 and 9514570 DF,  p-value: < 2.2e-16

Predict and check the accuracy.

library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
regression_pred <- predict(regression_model, valid_df)

accuracy(regression_pred, valid_df$X10.559091)
##                   ME     RMSE      MAE     MPE     MAPE
## Test set -0.01930436 40.15217 21.60468 66.9494 69.53629
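
As a cross-check, the RMSE computed by hand should match the value reported by accuracy():

sqrt(mean((valid_df$X10.559091 - regression_pred)^2))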

4. PostgreSQL

Install the packages first if they are not already installed, then load them.
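
A one-time install would look like this (uncomment to run):

# install.packages(c("RPostgreSQL", "RPostgres", "DBI"))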

library(RPostgreSQL)
## Loading required package: DBI
library(RPostgres)

4.1 Connect to the database

Connect to the database.

library(DBI)

The credentials below are hypothetical, for illustration. In practice, real credentials would be supplied to connect.

db <- "maytheforcebewithyou"

host_db <- "ec2-66-666-666-666.compute-1.amazonaws.com"

db_port <- "666"

db_user <- "thisistheway"  

db_password <- "theforceiswithme"

con <- dbConnect(RPostgres::Postgres(),
                 dbname = db,
                 host = host_db,
                 port = db_port,
                 user = db_user,
                 password = db_password)

dbListTables(con) 
## [1] "star_sample"

4.2 Work with database 1

The data (star_sample) were written to the database.

A small sample is used for illustration. In reality, a much bigger data set can be added to the database.
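
The write itself is not shown above; with DBI it would look something like this (a sketch, not run here):

# Hypothetical: write the sampled data frame to the connected database.
dbWriteTable(con, "star_sample", star_sample, overwrite = TRUE)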

Read the desired data.

df <- dbReadTable(con, "star_sample")
head(df)
##   X1 X1613423   X807 X20011015.2226039991 X1613424 X4518 X0 X0.1 X654 X1395
## 1  0  2312066  68350             20030125  2312067    24  0    0    0     5
## 2  0  1788986    898             20011109  1788987  1431  0    0   21   392
## 3  0   881270  16193             20011005   881187  5116  2    0  859  1579
## 4  2  1656019   9633             20011116  1656020  3947  0    0  431  1258
## 5  0  2313712 519929             20030127  2313713    11  0    0    0     0
## 6  1   950310   4599             20011007   950316  2881  0    0  472  1052
##   X20011204.1149509996 X10.955403 X2288071  X.0.28820264   X0.40731233
## 1             20030223   1.364189  4025016  3.440338e-01     0.4076403
## 2             20020311  43.949562  2313018 -2.651538e-01     0.3889353
## 3             20020413   0.000000  2278027 -1.082276e-01     0.4081968
## 4             20011225  18.446320  2320011 -2.602721e-01     0.4840141
## 5             20030223   0.000000  4026009 -9.999000e+03 -9999.0000000
## 6             20020430  16.253162  2280011 -7.096904e-02     0.4496775
##     X10.559091
## 1   103.046210
## 2     6.086166
## 3    20.120308
## 4    -9.494624
## 5 -9999.000000
## 6   -29.067465
nrow(df)
## [1] 1586

4.2.1 Exploration

Compute the mean of the fifth column, X1613424.

mean(df[,5])
## [1] 1561815
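
The same mean can also be computed inside the database with a raw query; a sketch (Postgres needs the mixed-case column name quoted):

dbGetQuery(con, 'SELECT AVG("X1613424") AS mean FROM star_sample')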

Filter the data.

subset <- subset(df, X4518 > 3000)
head(subset)
##    X1 X1613423  X807 X20011015.2226039991 X1613424 X4518 X0 X0.1 X654 X1395
## 3   0   881270 16193             20011005   881187  5116  2    0  859  1579
## 4   2  1656019  9633             20011116  1656020  3947  0    0  431  1258
## 7   1  1860644  5044             20011117  1868498  6436  2    0 1075  2226
## 10  4  1739747 24471             20011124  1739748  3666  0    0  422  1567
## 17  3  1013830 14993             20011018  1013831  4256  0    0  719  1427
## 19  3   457466  6579             20011030   457465  5550  3    0 1076  1790
##    X20011204.1149509996 X10.955403 X2288071 X.0.28820264 X0.40731233 X10.559091
## 3              20020413    0.00000  2278027   -0.1082276   0.4081968  20.120308
## 4              20011225   18.44632  2320011   -0.2602721   0.4840141  -9.494624
## 7              20020621   73.12578  2321003   -0.2552435   0.3376758  22.804056
## 10             20020311   49.86357  2327043   -0.3141898   0.4940140 -11.717839
## 17             20020510   33.75652  2291009   -0.2981063   0.5178973 -11.993046
## 19             20020529   22.29049  2302020   -0.3184646   0.3911200  -3.109656

Normalise.

Small subset for illustration.

subset_2 <- df[, c("X807", "X4518")]
head(subset_2)
##     X807 X4518
## 1  68350    24
## 2    898  1431
## 3  16193  5116
## 4   9633  3947
## 5 519929    11
## 6   4599  2881

Normalise.

subset_2_norm <- sapply(subset_2, scale)
head(subset_2_norm)
##            X807       X4518
## [1,]  0.2505253 -0.69705985
## [2,] -0.5507446 -0.03122171
## [3,] -0.3690535  1.71264008
## [4,] -0.4469805  1.15943128
## [5,]  5.6148833 -0.70321187
## [6,] -0.5067800  0.65496543
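
With its defaults, scale() simply centres each column and divides by its standard deviation; a manual check on one column:

z <- (subset_2$X4518 - mean(subset_2$X4518)) / sd(subset_2$X4518)
head(z)  # should match the X4518 column above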

4.2.2 Visualisations

Not sure what these variables mean.

For illustration only.

library(ggplot2)
ggplot(df) + aes(x = X1613424) + geom_histogram() +
  ggtitle("Histogram of X1613424")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df) + aes(x = X1613424, y = X4518) +
  geom_point(shape = 1, colour = "blue") + ggtitle("Scatter plot")

4.2.3 A model

Our favourite seed :-)

set.seed(666)

Create the indices for the split. This samples the row indices to split the data into training and validation sets.

train_index <- sample(1:nrow(df), 0.6 * nrow(df))
valid_index <- setdiff(1:nrow(df), train_index)

Using the indices, create the training and validation sets. This is similar in principle to splitting a data frame by row.

train_df <- df[train_index, ]
valid_df <- df[valid_index, ]

Check.

nrow(train_df)
## [1] 951
nrow(valid_df)
## [1] 635

Column names.

library(janitor)

compare_df_cols_same(train_df, valid_df)
## [1] TRUE
compare_df_cols(train_df, valid_df)
##             column_name train_df valid_df
## 1          X.0.28820264  numeric  numeric
## 2                    X0  integer  integer
## 3                  X0.1  integer  integer
## 4           X0.40731233  numeric  numeric
## 5                    X1  integer  integer
## 6            X10.559091  numeric  numeric
## 7            X10.955403  numeric  numeric
## 8                 X1395  integer  integer
## 9              X1613423  integer  integer
## 10             X1613424  integer  integer
## 11 X20011015.2226039991  numeric  numeric
## 12 X20011204.1149509996  numeric  numeric
## 13             X2288071  integer  integer
## 14                X4518  integer  integer
## 15                 X654  integer  integer
## 16                 X807  integer  integer

The model.

regression_model_2 <- lm(X10.559091 ~ X10.955403 + X2288071 + X.0.28820264, data = train_df)
summary(regression_model_2)
## 
## Call:
## lm(formula = X10.559091 ~ X10.955403 + X2288071 + X.0.28820264, 
##     data = train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -228.224   -6.901   -0.164   11.075  165.668 
## 
## Coefficients:
##                Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  -5.699e-02  5.293e+00   -0.011    0.991    
## X10.955403   -6.629e-05  3.513e-03   -0.019    0.985    
## X2288071     -5.863e-07  1.768e-06   -0.332    0.740    
## X.0.28820264  9.998e-01  2.887e-04 3463.596   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41.2 on 947 degrees of freedom
## Multiple R-squared:  0.9999, Adjusted R-squared:  0.9999 
## F-statistic: 4.115e+06 on 3 and 947 DF,  p-value: < 2.2e-16

The prediction.

library(forecast)

regression_pred_2 <- predict(regression_model_2, valid_df)

accuracy(regression_pred_2, valid_df$X10.559091)
##               ME     RMSE      MAE      MPE   MAPE
## Test set 2.51122 41.55467 22.46505 76.20738 78.922

4.3 Work with database 2

Read the desired data.

library(dplyr)

df_2 <- dplyr::tbl(con, "star_sample")
head(df_2)
## # Source:   SQL [6 x 16]
## # Database: postgres  [vwrflyqyjdgszl@ec2-52-204-157-26.compute-1.amazonaws.com:5432/degtd5nnt804c6]
##      X1 X1613423   X807 X2001101…¹ X1613…² X4518    X0  X0.1  X654 X1395 X2001…³
##   <int>    <int>  <int>      <dbl>   <int> <int> <int> <int> <int> <int>   <dbl>
## 1     0  2312066  68350  20030125. 2312067    24     0     0     0     5  2.00e7
## 2     0  1788986    898  20011109. 1788987  1431     0     0    21   392  2.00e7
## 3     0   881270  16193  20011005.  881187  5116     2     0   859  1579  2.00e7
## 4     2  1656019   9633  20011116. 1656020  3947     0     0   431  1258  2.00e7
## 5     0  2313712 519929  20030127. 2313713    11     0     0     0     0  2.00e7
## 6     1   950310   4599  20011007.  950316  2881     0     0   472  1052  2.00e7
## # … with 5 more variables: X10.955403 <dbl>, X2288071 <int>,
## #   X.0.28820264 <dbl>, X0.40731233 <dbl>, X10.559091 <dbl>, and abbreviated
## #   variable names ¹​X20011015.2226039991, ²​X1613424, ³​X20011204.1149509996
## # ℹ Use `colnames()` to see all variable names

4.3.1 Exploration

Compute the mean of X807 by group using dplyr.

df_2 %>%
  group_by(X1) %>%
  dplyr::summarise(mean = mean(X807, na.rm=TRUE))
## # Source:   SQL [8 x 2]
## # Database: postgres  [vwrflyqyjdgszl@ec2-52-204-157-26.compute-1.amazonaws.com:5432/degtd5nnt804c6]
##      X1   mean
##   <int>  <dbl>
## 1     3 16464.
## 2     5 28002.
## 3     4 18411.
## 4     0 57154.
## 5     6 12493 
## 6     2 16472.
## 7     7  7304 
## 8     1 32527.
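
Because df_2 is a lazy, database-backed table, dbplyr translates this pipeline to SQL and the database does the work. show_query() reveals the generated SQL (output not shown):

df_2 %>%
  group_by(X1) %>%
  dplyr::summarise(mean = mean(X807, na.rm = TRUE)) %>%
  show_query()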

Filter the data using dplyr.

subset_3 <- filter(df_2, X4518 > 3000, X1395 <= 1000)
head(subset_3)
## # Source:   SQL [6 x 16]
## # Database: postgres  [vwrflyqyjdgszl@ec2-52-204-157-26.compute-1.amazonaws.com:5432/degtd5nnt804c6]
##      X1 X1613423  X807 X20011015…¹ X1613…² X4518    X0  X0.1  X654 X1395 X2001…³
##   <int>    <int> <int>       <dbl>   <int> <int> <int> <int> <int> <int>   <dbl>
## 1     3  1764917 43931   20011022. 1764916  5100     0     0   835   865  2.00e7
## 2     5   823680 70007   20010914.  823687  4610     1     0   741   782  2.00e7
## 3     3   372932  2055   20010925.  372925  4327     1     0   636   990  2.00e7
## 4     4   814282 68003   20011001.  814286  4833     1     0   812   992  2.00e7
## 5     4  1754094  9831   20010926. 1754093  5316     3     0   932   992  2.00e7
## 6     0   381609   574   20010925.  381608  5015     0     0     0     0  2.00e7
## # … with 5 more variables: X10.955403 <dbl>, X2288071 <int>,
## #   X.0.28820264 <dbl>, X0.40731233 <dbl>, X10.559091 <dbl>, and abbreviated
## #   variable names ¹​X20011015.2226039991, ²​X1613424, ³​X20011204.1149509996
## # ℹ Use `colnames()` to see all variable names
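
subset_3 is still a lazy table; collect() pulls the filtered rows into an ordinary tibble when needed:

subset_3_local <- collect(subset_3)
nrow(subset_3_local)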

Normalise.

Small subset for illustration.

subset_4 <- df_2 %>% 
  select(c(X807, X4518))

head(subset_4)
## # Source:   SQL [6 x 2]
## # Database: postgres  [vwrflyqyjdgszl@ec2-52-204-157-26.compute-1.amazonaws.com:5432/degtd5nnt804c6]
##     X807 X4518
##    <int> <int>
## 1  68350    24
## 2    898  1431
## 3  16193  5116
## 4   9633  3947
## 5 519929    11
## 6   4599  2881

Normalise. Note that as_tibble() first collects the rows into R, since scale() has no SQL translation.

subset_4_norm <- subset_4 %>%
  as_tibble() %>%
  mutate(across(where(is.numeric), scale))
head(subset_4_norm)
## # A tibble: 6 × 2
##   X807[,1] X4518[,1]
##      <dbl>     <dbl>
## 1    0.251   -0.697 
## 2   -0.551   -0.0312
## 3   -0.369    1.71  
## 4   -0.447    1.16  
## 5    5.61    -0.703 
## 6   -0.507    0.655
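
The X807[,1] headers show that scale() returned one-column matrices. Wrapping it in as.numeric() keeps plain vector columns; a sketch:

subset_4_norm <- subset_4 %>%
  as_tibble() %>%
  mutate(across(where(is.numeric), ~ as.numeric(scale(.x))))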

4.3.2 Visualisations

Not sure what these variables mean.

For illustration only.

library(ggplot2)
ggplot(df_2) + aes(x = X1613424) + geom_histogram() +
  ggtitle("Histogram of X1613424")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_2) + aes(x = X1613424, y = X4518) +
  geom_point(shape = 1, colour = "blue") + ggtitle("Scatter plot")

4.3.3 A model

Our favourite seed :-)

set.seed(666)

Create a row ID; it will be used to separate the validation set with an anti-join.

df_2 <- df_2 %>% mutate(id = row_number())

Check the ID variable.

head(df_2)
## # Source:   SQL [6 x 17]
## # Database: postgres  [vwrflyqyjdgszl@ec2-52-204-157-26.compute-1.amazonaws.com:5432/degtd5nnt804c6]
##      X1 X1613423   X807 X2001101…¹ X1613…² X4518    X0  X0.1  X654 X1395 X2001…³
##   <int>    <int>  <int>      <dbl>   <int> <int> <int> <int> <int> <int>   <dbl>
## 1     0  2312066  68350  20030125. 2312067    24     0     0     0     5  2.00e7
## 2     0  1788986    898  20011109. 1788987  1431     0     0    21   392  2.00e7
## 3     0   881270  16193  20011005.  881187  5116     2     0   859  1579  2.00e7
## 4     2  1656019   9633  20011116. 1656020  3947     0     0   431  1258  2.00e7
## 5     0  2313712 519929  20030127. 2313713    11     0     0     0     0  2.00e7
## 6     1   950310   4599  20011007.  950316  2881     0     0   472  1052  2.00e7
## # … with 6 more variables: X10.955403 <dbl>, X2288071 <int>,
## #   X.0.28820264 <dbl>, X0.40731233 <dbl>, X10.559091 <dbl>, id <int64>, and
## #   abbreviated variable names ¹​X20011015.2226039991, ²​X1613424,
## #   ³​X20011204.1149509996
## # ℹ Use `colnames()` to see all variable names

Create the training set.

set.seed(666)
train_df <- sample_frac(as_tibble(df_2), 0.6)
head(train_df)
## # A tibble: 6 × 17
##      X1 X1613423  X807 X20011015…¹ X1613…² X4518    X0  X0.1  X654 X1395 X2001…³
##   <int>    <int> <int>       <dbl>   <int> <int> <int> <int> <int> <int>   <dbl>
## 1     4  1070192 11534   20011021. 1070193  5225     0     0   872  1679  2.00e7
## 2     0  1647404 46077   20011114. 1647405     2     0     0     0     0  2.00e7
## 3     1  1105542  3057   20011030. 1105548  4252     0     0   611  1410  2.00e7
## 4     1  1784081  2704   20010927. 1784080  2399     0     0   258   897  2.00e7
## 5     0  1759349 19047   20011115. 1759350     8     0     0     0     0  2.00e7
## 6     0  1055683 37323   20011022. 1055684     0     0     0     0     0  2.00e7
## # … with 6 more variables: X10.955403 <dbl>, X2288071 <int>,
## #   X.0.28820264 <dbl>, X0.40731233 <dbl>, X10.559091 <dbl>, id <int64>, and
## #   abbreviated variable names ¹​X20011015.2226039991, ²​X1613424,
## #   ³​X20011204.1149509996
## # ℹ Use `colnames()` to see all variable names

Create the validation set.

valid_df  <- anti_join(as_tibble(df_2), train_df, by = 'id')
head(valid_df)
## # A tibble: 6 × 17
##      X1 X1613423   X807 X2001101…¹ X1613…² X4518    X0  X0.1  X654 X1395 X2001…³
##   <int>    <int>  <int>      <dbl>   <int> <int> <int> <int> <int> <int>   <dbl>
## 1     0  1788986    898  20011109. 1788987  1431     0     0    21   392  2.00e7
## 2     0   881270  16193  20011005.  881187  5116     2     0   859  1579  2.00e7
## 3     1   950310   4599  20011007.  950316  2881     0     0   472  1052  2.00e7
## 4     4  1739747  24471  20011124. 1739748  3666     0     0   422  1567  2.00e7
## 5     0  2314662 271890  20030127. 2314663    73     1     0     0    33  2.00e7
## 6     0   689758  16009  20011009.  689757     3     0     0     0     0  2.00e7
## # … with 6 more variables: X10.955403 <dbl>, X2288071 <int>,
## #   X.0.28820264 <dbl>, X0.40731233 <dbl>, X10.559091 <dbl>, id <int64>, and
## #   abbreviated variable names ¹​X20011015.2226039991, ²​X1613424,
## #   ³​X20011204.1149509996
## # ℹ Use `colnames()` to see all variable names

The model.

regression_model_3 <- lm(X10.559091 ~ X10.955403 + X2288071 + X.0.28820264, data = train_df)
summary(regression_model_3)
## 
## Call:
## lm(formula = X10.559091 ~ X10.955403 + X2288071 + X.0.28820264, 
##     data = train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -228.207   -6.955   -0.187   11.085  165.822 
## 
## Coefficients:
##                Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)   1.128e-01  5.289e+00    0.021    0.983    
## X10.955403   -6.145e-05  3.512e-03   -0.017    0.986    
## X2288071     -6.667e-07  1.765e-06   -0.378    0.706    
## X.0.28820264  9.998e-01  2.886e-04 3464.307   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41.19 on 948 degrees of freedom
## Multiple R-squared:  0.9999, Adjusted R-squared:  0.9999 
## F-statistic: 4.118e+06 on 3 and 948 DF,  p-value: < 2.2e-16

The prediction.

library(forecast)

regression_pred_3 <- predict(regression_model_3, valid_df)

accuracy(regression_pred_3, valid_df$X10.559091)
##                ME     RMSE      MAE      MPE     MAPE
## Test set 2.607477 41.56346 22.45601 76.45489 79.26123

Close the connection.

dbDisconnect(con)

Make a wish upon a star.