Dodatkowe tematy

Table of Contents

1 Manipulowanie danymi

  • data frames
  • tibble packages
  • dplyr package

1.1 Tibble

### Czyszczenie
rm(list = ls())

### Ramki danych
data.frame(id = 1:5, value = rnorm(5))
a <- data.frame(id = 1:100, value = rnorm(100))
a
head(a)
tail(a)
str(a)

### Czego nie można zrobić z ramką danych
b <- data.frame(
  x = rnorm(100),
  y = 1 + x + rnorm(100))

### Wgrywanie biblioteki
library(tibble)

### Tworzenie tibble'a
a <- tibble(
  x = rnorm(100),
  y = 1 + x + rnorm(100),
  classX = ifelse(x > 0, "a", "b"),
  classY = ifelse(y > 0, "A", "B"))

a
print(a, n = 30)

typeof(a)
class(a)

### Kowersja do tibble'a
b <- data.frame(id = 1:100, value = rnorm(100))
class(b)
bTB <- as_tibble(b)
bTB
class(bTB)

### Indeksowanie tibble'a jest identyczne jak w przypadku ramki danych
class(a)
names(a)
a$x
a[c("x", "classY")]
a["x"]
a[a$x > a$y, "classX"]
a[["x"]]

### Typ bazowy jest standardowy
typeof(a)

1.2 Pakiet dplyr

Działanie pakietu dplyr bazuje głównie na wykorzystaniu następujący operatorów

  • %>%
  • select()
  • filter()
  • arrange()
  • mutate()
  • summarise()
  • group_by()
  • pull()
  • operatory *_join
### Czyszczenie
rm(list = ls())

### Biblioteki
library(dplyr)

### Dane
d <- as_tibble(read.csv("http://michal.ramsza.org/lectures/2_r_programming/data/data_2.csv"))
dim(d)
d

### select()
select(.data = d, "Price", "Mileage")

d %>% select("Price", "Mileage")
d %>% select(Price, Mileage)

plot(d %>% select("Mileage", "Price"))
plot(d %>% select("Mileage", "Price"),
  col = rgb(0, 0, 1, 0.1), pch = 20)

d
d %>% select(-Brand, -Model, -Negotiation, -City)
d %>% select(Gas_type:Price)
d %>% select(starts_with("M"))
d %>% select(one_of("Price", "Mileage", "Owner"))
suppressWarnings(d %>% select(one_of("Price", "Mileage", "Owner")))
d[, c("Price", "Mileage", "Owner")] # fails

### filter
d %>% filter(Price < 1000)
sort(unique(d$Brand))
d %>% filter(Price < 2000 & Brand == "Audi")
d %>% filter(Price < 1000 & Brand == "Fiat" | Price < 2000 & Brand == "Audi")

d %>%
  filter(Brand == "Audi" & Voivodeship == "Mazowieckie") %>%
  select(Price) %>%
  pull(Price)

### Przykład
colBorder = rgb(1, 0, 0, .8)
colMain = rgb(1, 0, 0, .2)

hist(
  d %>%
    filter(Brand == "Audi" & Voivodeship == "Mazowieckie") %>%
    select(Price) %>%
    pull(Price),
  probability = TRUE,
  breaks = "Scott",
  col = colMain, border = colBorder,
  main = "Rozkład cen",
  xlab = "cena",
  ylab = "gęstość"
)
grid(lty = "solid", col = colMain)

### Przykład
colBorder = rgb(1, 0, 0, .8)
colMain = rgb(1, 0, 0, .2)
colBorder2 = rgb(0, 1, 0, .8)
colMain2 = rgb(0, 1, 0, .2)

audi <- d %>%
  filter(Brand == "Audi" & Voivodeship == "Mazowieckie") %>%
  select(Price) %>%
  pull(Price)

fiat <- d %>%
  filter(Brand == "Fiat" & Voivodeship == "Mazowieckie") %>%
  select(Price) %>%
  pull(Price)

par(mfrow = c(2, 1))

hist(
  audi,
  probability = TRUE,
  breaks = seq(min(audi), max(audi), length.out = 100),
  col = colMain, border = colBorder,
  main = "Rozkład cen Audi",
  xlab = "cena",
  ylab = "gęstość"
)
grid(lty = "solid", col = colMain)

hist(
  fiat,
  probability = TRUE,
  breaks = seq(min(fiat), max(fiat), length.out = 100),
  col = colMain2, border = colBorder2,
  main = "Rozkład cen Fiat",
  xlab = "cena",
  ylab = "gęstość"
)
grid(lty = "solid", col = colMain2)

### Przykład
d1 <- d %>%
  select(Brand, Mileage, Price) %>%
  filter(Brand %in% c("Fiat", "Audi"))

xr <- range(pull(.data = d1, var = "Mileage"))
yr <- range(pull(.data = d1, var = "Price"))

par(mfrow = c(1, 1))

plot(
  d1 %>% filter(Brand == "Audi") %>% select(Mileage, Price),
  pch = 20,
  col = rgb(1, 0, 0, .03)
)
points(
  d1 %>% filter(Brand == "Fiat") %>% select(Mileage, Price),
  pch = 20,
  col = rgb(0, 0, 1, .03)
)
lines(
  lowess(x = d1 %>% filter(Brand == "Audi") %>% select(Mileage, Price)),
  col = "red", lwd = 3)
lines(
  lowess(x = d1 %>% filter(Brand == "Fiat") %>% select(Mileage, Price)),
  col = "blue", lwd = 3)


### arrange
d %>% arrange(Price)
d %>% arrange(desc(Price))
d %>% arrange(Brand, desc(Price))
d %>%
  filter(Brand %in% c("Honda", "Fiat")) %>%
  arrange(desc(Price), Brand)

### mutate
d2 <- d %>%
  select(Year_prod, Price, Mileage) %>%
  mutate(MpY = Mileage / (2019 - Year_prod)) %>%
  select(MpY, Price)

plot(d2, col = rgb(0, 0, 1, 0.03), pch = 20, cex = 1)

### Przykład
d2 <- d %>%
  select(Year_prod, Price, Mileage) %>%
  mutate(
    MpY = Mileage / (2019 - Year_prod),
    Class = ifelse(abs(mean(Price)-Price) > sd(Price), "Red", "Blue")
  ) %>%
  select(MpY, Price, Class)

plot(d2 %>% select(MpY, Price), col = pull(d2, "Class"), pch = 20, cex = 1)
legend(x = "topright", legend = c("Wysoka cena", "Cena w normie"), fill = c("red", "blue"))

### summarise
d %>% summarise(
        "średni przebieg" = mean(Mileage),
        "średnia cena" = mean(Price),
        "minimalna cena" = min(Price),
        "maksymalna cena" = max(Price)
      )

### group_by
d %>%
  group_by(Brand) %>%
  summarise(mean_price = mean(Price), std_dev = sd(Price)) %>%
  arrange(Brand, mean_price)

d %>%
  group_by(Voivodeship) %>%
  summarise(mean_price = mean(Price)) %>%
  arrange(desc(mean_price))

d %>%
  group_by(Voivodeship) %>%
  summarise(num_of_offers = n()) %>%
  arrange(desc(num_of_offers))

d %>%
  mutate(newCar = ifelse(Mileage < 20000, 1, 0)) %>%
  group_by(Voivodeship, newCar) %>%
  summarise(num_of_offers = n()) %>%
  arrange(Voivodeship, desc(num_of_offers))

d %>%
  mutate(newCar = ifelse(Mileage < 20000, 1, 0)) %>%
  group_by(Voivodeship, newCar) %>%
  summarise(num_of_offers = n()) %>%
  arrange(Voivodeship, desc(num_of_offers)) %>%
  ungroup()


### Przykład
d %>%
  mutate(newCar = ifelse(Mileage < 20000, 1, 0)) %>%
  group_by(Voivodeship, newCar) %>%
  summarise(num_of_offers = n(), .groups = "drop") %>%
  arrange(Voivodeship, desc(num_of_offers))

### Przykład
d1 <- d %>%
  mutate(newCar = ifelse(Mileage < 20000, 1, 0)) %>%
  group_by(Voivodeship, newCar) %>%
  summarise(num_of_offers = n(), .groups = "drop") %>%
  arrange(Voivodeship, desc(num_of_offers))

d1

tibble(
  voi = d1 %>% filter(newCar == 0) %>% select(Voivodeship) %>% pull(),
  oldCarN = d1 %>% filter(newCar == 0) %>% select(num_of_offers) %>% pull(),
  newCarN = d1 %>% filter(newCar == 1) %>% select(num_of_offers) %>% pull()
) %>%
  arrange(desc(oldCarN))

### Łączenie tabel *_join
a <- tibble(id = (1:5)[-2], name = LETTERS[1:4], value = rnorm(4))
a

b <- tibble(
  id = 1:4,
  nameSmall = sample(letters[1:2], 4, replace = TRUE),
  value2 = rbinom(4, 10, .5))
b

left_join(a, b, by = "id")
right_join(a, b, by = "id")
inner_join(a, b, by = "id")
full_join(a, b, by = "id")

a %>%
  left_join(b, by = "id") %>%
  select(id, value, value2)

2 Grafika

2.1 Standardowa grafika

  • standard stack (par(), plot(), lines(), …)
### Czyszczenie
rm(list = ls())

### Pakiety
library(dplyr)

### Przykładowe dane
data(iris)
d <- as_tibble(iris)
dd <- d %>% select(-Species)
ddd <- cmdscale(dist(dd), eig = TRUE, k = 2)$points

### Przykładowy wykres
plot(ddd)

plot(ddd,
  pch = 20,
  cex = 2,
  col = rgb(1, 0, 0, .5),
  xlab = "wymiar 1",
  ylab = "wymiar 2",
  las = 1)
grid()

### Lista wszystkich opcji
.Pars
graphics::.Pars
graphics:::.Pars
length(graphics:::.Pars)

### Lista wartości dla wszystkich opcji
par()
names(par())
graphics:::.Pars

### Zmiana wartości dla przykładowej opcji
par()$fg
par(fg = "magenta")
par()$fg

plot(ddd,
  pch = 20,
  cex = 2,
  col = rgb(1, 0, 0, .5),
  xlab = "wymiar 1",
  ylab = "wymiar 2",
  las = 1)
grid()

?par

### Nieco bardziej skomplikowana grafika
par(mai = c(1, 1, 2, .5), bg = rgb(.1, .1, .1) )
plot(ddd,
  main = "Reduced data",
  xlab = "Reduced dim 1",
  ylab = "Reduced dim 2",
  xlim = c(-4, 4),
  ylim = c(-4, 4),
  pch = 20,
  cex = 2,
  col = rgb(0, 0, 1, .8),
  bty = "7",
  cex.main = 4,
  cex.axis = .5,
  cex.lab = 2,
  col.axis = rgb(0, 1, 1),
  col.main = rgb(0, 0.8, 1),
  col.lab = "red",
  fg = "red",
  las = 1,
  xaxp = c(-4, 4, 10),
  yaxp = c(-4, 4, 20)
)
grid(col = "white", lty = 2, lwd = 0.5)

### Nieco bardziej przemyślany przykład
names(iris)
iris$Species
d1 <- iris %>% filter(Species == "setosa") %>% select(Sepal.Length, Sepal.Width)
d2 <- iris %>% filter(Species == "versicolor") %>% select(Sepal.Length, Sepal.Width)
head(d1)
head(d2)

xrange <- range(d1$Sepal.Length, d2$Sepal.Length)
yrange = range(d1$Sepal.Width, d2$Sepal.Width)

s <- 0.8 # margins
k <- .9  # color
par(mfrow = c(2, 1),
  oma = c(2, 1, 1, 2),
  mai = c(s, 1, s/2, 1),
  cex.axis = .9,
  bg = "white", fg = "black")
plot(d1,
  xlim = xrange,
  ylim = yrange,
  pch = 20,
  col = rgb(k, 0, 1, .5),
  font.lab = 3
)
mtext(text = "Setosa",
  outer = FALSE,
  line = 1,
  side = 4,
  cex = 2,
  col = rgb(k, 0, 1, .2),
  font = 4)
abline(lm(Sepal.Width ~ Sepal.Length, data = d1),
  col = rgb(k, 0, 1, .4), lty = 4)
plot(d2,
  xlim = xrange,
  ylim = yrange,
  pch = 20,
  col = rgb(0, k, 1, .5),
  font.lab = 3
)
mtext(text = "Versicolor",
  outer = FALSE,
  line = 1,
  side = 4,
  cex = 2,
  col = rgb(0, k, 1, .2),
  font = 4)
abline(lm(Sepal.Width ~ Sepal.Length, data = d2),
  col = rgb(0, k, 1, .4), lty = 4)

dev.copy(device = png, "./fig.png", width = 600, height = 1200)
dev.off()

2.2 Pakiet ggplot2

### Czyszczenie
rm(list = ls())

### Pakiety
library(dplyr)
library(ggplot2)

### Dane
d <- as_tibble(read.csv("http://michal.ramsza.org/lectures/2_r_programming/data/data_2.csv"))

### Pierwszy wykres / gdzie jest wykres?
p1 <- ggplot(data = d, aes(x = Mileage, y = Price)) +
  geom_point()

typeof(p1)
names(p1)
p1$labels
p1$labels$x <- "Przebieg samochodu"
print(p1)


### Bardziej zaawansowane mapowania i geomy
ggplot(
  data = d %>% filter(Brand %in% c("Audi", "Fiat")),
  aes(x = Mileage, y = Price, color = Brand)) +
  geom_point(alpha = 0.4)

ggplot(
  data = d %>% filter(Brand %in% c("Audi", "Fiat")),
  aes(x = Mileage, y = Price, color = Mileage, alpha = Price)) +
  geom_point()


d1 <- d %>% filter(Brand %in% c("Fiat", "Audi"))
ggplot(data = d1, aes(x = Mileage, y = Price, color = Brand, size = Engine_capacity)) +
  geom_point(alpha = 0.4)

ggplot(d, aes(x = Mileage)) + geom_histogram(binwidth = 10000)
ggplot(d, aes(x = Mileage)) + geom_freqpoly(binwidth = 1000)

d1 <- d %>% select(Brand) %>%
  filter(Brand %in% c("Audi", "Fiat", "Honda", "Toyota")) %>%
  arrange(Brand)

ggplot(d1, aes(x = Brand)) +
  geom_bar(color = "red", fill = "green") # liczba elementów

d1 <- d %>% select(Brand, Price) %>%
  filter(Brand %in% c("Audi", "Fiat", "Honda", "Toyota")) %>%
  group_by(Brand) %>%
  summarise(mean_price = mean(Price))
d1

ggplot(d1, aes(x = Brand, y = mean_price)) +
  geom_bar(stat = "identity") # podana statystyka

d1 <- d %>% select(Brand, Price) %>%
  filter(Brand %in% c("Audi", "Fiat", "Honda", "Toyota"))
d1

ggplot(d1, aes(x = Brand, y = Price)) + geom_boxplot()
ggplot(d1, aes(x = Brand, y = Price)) + geom_jitter()
ggplot(d1, aes(x = Brand, y = Price)) + geom_violin()

d1 <- d %>% select(Brand, Price) %>%
  filter(Brand %in% c("Audi", "Fiat"))
d1

### faceting
ggplot(d1, aes(x = Price)) +
  geom_histogram(bins = 200) +
  facet_wrap(~ Brand, ncol = 1)

### proste zmiany w stylu
d3 <- d %>% select(Year_prod, Price, Brand) %>%
  group_by(Year_prod, Brand) %>%
  summarise(meanPrice = mean(Price)) %>%
  arrange(Brand, Year_prod)
d3

g <- ggplot(d3, aes(x = Year_prod, y = meanPrice, color = Brand)) +
  geom_point() +
  geom_line() + 
  xlab("year of production") + 
  ylab("average price") +
  xlim(1990, 2010) + 
  ylim(NA, 10^6) +
  labs(title = "Average yearly prices", subtitle = "Data for used cars", caption = "Crash course on R")

print(g)

### mniej proste zmiany stylu
g + theme(
  plot.title=element_text(size=20, 
    face="bold", 
    family="American Typewriter",
    color="tomato",
    hjust=0.5,
    lineheight=1.2),  # title
  plot.subtitle=element_text(size=15, 
    family="American Typewriter",
    face="bold",
    hjust=0.5),  # subtitle
  plot.caption=element_text(size=15),  # caption
  axis.title.x=element_text(vjust=10,  
    size=15),  # X axis title
  axis.title.y=element_text(size=15),  # Y axis title
  axis.text.x=element_text(size=10, 
    angle = 30,
    vjust=.5),  # X axis text
  axis.text.y=element_text(size=10))  # Y axis text

2.3 Tworzenie wykresów z mapami

Czasami trzeba stworzyć wykres, który jest mapą. Poniżej jest prosty przykład tworzenia takich wykresów bez wykorzytania zewnętrznych plików z danymi dotyczącymi mapy.

### Biblioteki
library(ggplot2)
library(grid)
library(dplyr)
library(rworldmap)

### Tworzenie danych do mapy
worldMap <- getMap()

### Definiowanie nazw krajów EU (poza Luksemburg, Malta oraz Cypr)
europeanUnion <- c(
  "Austria","Belgium","Bulgaria","Croatia",
  "Czech Rep.","Denmark","Estonia","Finland","France",
  "Germany","Greece","Hungary","Ireland","Italy","Latvia",
  "Lithuania","Netherlands","Poland",
  "Portugal","Romania","Slovakia","Slovenia","Spain",
  "Sweden")

indEU <- which(worldMap$NAME %in% europeanUnion)

## Współrzędne geograficzne
europeCoords <- lapply(
  X = indEU,
  FUN = function(i){
    df <- data.frame(worldMap@polygons[[i]]@Polygons[[1]]@coords)
    df$region =as.character(worldMap$NAME[i])
    colnames(df) <- list("long", "lat", "region")
    return(df)
  }
)

europeCoords <- do.call("rbind", europeCoords)

### Nadawanie przykładowych wartości
countryData <- tibble(
  countryName = europeanUnion,
  value = ifelse(rnorm(length(countryName)) < 0, "Negative", "Positive")
)

europeCoords$value <- countryData$value[match(europeCoords$region,countryData$countryName)]

### Tworzenie przykładowej mapy
P <- ggplot() +
  geom_polygon(
    data = europeCoords,
    aes(x = long, y = lat, group = region, fill = value),
    colour = "black", size = 0.2) +
  coord_map(xlim = c(-13, 35),  ylim = c(32, 71))
P <- P +
  scale_fill_manual(
    name = "Country class",
    values = c("blue", "red", "green"))
P <- P +
  theme(
    panel.grid.minor = element_line(colour = NA),
    panel.background = element_rect(fill = NA, colour = NA),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(), axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(), axis.title = element_blank(),
    rect = element_blank(),
    plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines"))
print(P)

3 Import i eksport danych

3.1 Text based formats

  • read.csv, write.csv
  • read.table, write.table
### Czyszczenie
rm(list = ls())

### Przykładowe dane
d <- data.frame(
    id = 1:5,
    value = rnorm(5),
    class = sample(letters[1:2], 5, replace = TRUE))

d

### CSV
write.csv(file = "./dane.csv", d)
system("ls -l", intern = TRUE)
system("cat ./dane.csv", intern = TRUE)

read.csv(file = "./dane.csv")

### write.table
write.table(file = "./dane1.data",
  x = d,
  sep = " $ ",
  dec = "@",
  row.names = FALSE,
  fileEncoding = "UTF-8")

system("cat ./dane1.data", intern = TRUE)

read.table(file = "./dane1.data",
  sep = "$",
  dec = "@",
  header = TRUE,
  fileEncoding = "UTF-8")

3.2 XLSX format

  • openxlsx package
### Czyszczenie
rm(list = ls())

### Biblioteki
library(openxlsx)
library(dplyr)

### -----------------------------------------------------------------
### Czytanie

data1 <- read.xlsx(
  xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx",
  sheet = "CPI",
  startRow = 1,
  colNames = TRUE,
  rowNames = FALSE)

head(data1)

data2 <- read.xlsx(
  xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx",a
  sheet = "CPI",
  startRow = 1,
  colNames = FALSE,
  rowNames = FALSE)

head(data2)

data3 <- read.xlsx(
  xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx",
  sheet = "CPI",
  startRow = 2,
  colNames = FALSE,
  rowNames = FALSE)

head(data3)

data4 <- read.xlsx(
  xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx",
  sheet = "CPI2",
  startRow = 1,
  colNames = TRUE,
  rowNames = TRUE)

head(data4)

### Czytanie fragmentów
data5 <- read.xlsx(
  xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx",
  sheet = "CPI2",
  colNames = TRUE,
  rows = 1:5)

data5

data6 <- read.xlsx(
  xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx",
  sheet = "CPI2",
  colNames = TRUE,
  rows = 1:5,
  cols = 2:4)

data6

data7 <- read.xlsx(
  xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx",
  sheet = "CPI3",
  colNames = TRUE,
  startRow = 1,
  skipEmptyRows = TRUE)

data7

data8 <- read.xlsx(
  xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx",  
  sheet = "CPI3",
  colNames = TRUE,
  startRow = 1,
  skipEmptyRows = FALSE)

data8

### Bardziej skomplikowany przykład
readClientTable <- function(fileName, sheetName, xrow, xcol){
  ## czytanie nazwy klienta
  klient <- unlist(read.xlsx(xlsxFile = fileName,
    sheet = sheetName,
    colNames = FALSE,
    rowNames = FALSE,
    rows = xrow,
    cols = xcol))
  ## czytanie lat
  lata <- unlist(read.xlsx(xlsxFile = fileName,
    sheet = sheetName,
    colNames = FALSE,
    rowNames = FALSE,
    rows = xrow + 1,
    cols = (xcol + 1):(xcol + 5)))
  ## czytanie etykiet
  kategorie <- unlist(read.xlsx(xlsxFile = fileName,
    sheet = sheetName,
    colNames = FALSE,
    rowNames = FALSE,
    rows = (xrow + 2):(xrow + 4),
    cols = xcol))
  ## czytanie wartości
  wartosci <- as.matrix(read.xlsx(xlsxFile = fileName,
    sheet = sheetName,
    colNames = FALSE,
    rowNames = FALSE,
    rows = (xrow + 2):(xrow + 4),
    cols = (xcol + 1):(xcol + 5)))
  ## tworzenie macierzy
  colnames(wartosci) <- lata
  row.names(wartosci) <- kategorie
  wartosci
}

readClientTable(
  fileName = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx",
  sheetName = "Data", xrow = 5, xcol = 3)

readClientTable(
  fileName = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx",
  sheetName = "Data", xrow = 5, xcol = 11)

readClientTable(
  fileName = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx",
  sheetName = "Data", xrow = 12, xcol = 3)

readClientTable(
  fileName = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx",
  sheetName = "Data", xrow = 12, xcol = 11)

### -----------------------------------------------------------------
### Tworzenie raportów w XLSX

### Przykład prosty

data <- data.frame(label = letters) %>%
  mutate(x = rnorm(length(label))) %>%
  mutate(y = 2 * x + 1 + rnorm(length(x)))
head(data)

write.xlsx(x = data, file = "./data_3.xlsx")

write.xlsx(x = data, file = "./data_3a.xlsx", asTable = TRUE)

write.xlsx(x = data,
  file = "./data_4.xlsx",
  asTable = TRUE,
  title = "The X data",
  creator = "Michał Ramsza",
  sheetName = "Some data",
  gridLines = FALSE,
  tabColour = "red"
)

### -----------------------------------------------------------------
### Zrobienie znacznie bardziej skomplikowanego raportu
data <- data.frame(x = rnorm(500)) %>%
  mutate(y = 2 * x + 0.3 * x^2 - 1 + rnorm(500))
head(data)

### Tworzenie wirtualnego pliku
wb <- createWorkbook(
  creator = "Michał Ramsza",
  title = "Example workbook",
  subject = "Linear regression",
  category = "Workshop"
)

wb

### Dodawanie arkuszy
addWorksheet(wb = wb,
  sheetName = "Linear regression",
  tabColour = "blue",
  gridLines = FALSE,
  header = c("Left", "&[Date]", "Right"),
  footer = c("Left", "Środek", "Right"),
  visible = TRUE)

wb

styl1 <- createStyle(
  fontName = "Calibri",
  fontSize = 14,
  fontColour = "tomato1",
  numFmt = "GENERAL",
  fgFill = "snow2",
  borderColour = "snow3",
  halign = "center",
  valign = "center",
  )

styl2 <- createStyle(
  fontName = "Calibri",
  fontSize = 11,
  fontColour = "black",
  numFmt = "NUMBER",
  border = "TopBottomLeftRight",
  borderColour = "black",
  halign = "center",
  valign = "center",
  )

etykiety <- names(data)
dim(etykiety) <- c(1, 2)

writeData(wb = wb, sheet = 1,
  x = etykiety, startCol = 2,
  startRow = 2,colNames = FALSE, rowNames = FALSE)

addStyle(wb = wb, sheet = 1, style = styl1, rows = 2,
  cols = 2:3, gridExpand = TRUE)
setRowHeights(wb = wb, sheet = 1, rows = 2, heights = 40)
setColWidths(wb = wb, sheet = 1, cols = 2:3, widths = 20)

writeData(wb = wb, sheet = 1, x = data,
  startCol = 2, startRow = 3, colNames = FALSE,
  rowNames = FALSE)
addStyle(wb = wb, sheet = 1, style = styl2,
  rows = 3:502, cols = 2:3, gridExpand = TRUE)

plot(data, pch = 20, cex = 2, col = rgb(0, 0, 1, 0.2),
  main = "Comparison of linear regression\nand polynomial smoothing")
grid()
abline(lm(y ~ x, data = data), col = "blue", lty = "dashed")
lines(lowess(x = data$x, y = data$y, f = 1/2))
legend(x = "topleft",
  legend = c("linear regression", "LOWESS"),
  col = c("blue", "black"),
  lty = c("dashed", "solid"),
  bg = "white",
  y.intersp = 2,
  text.width = 2,
  bty = "o")
insertPlot(wb = wb, sheet = 1, startRow = 2, startCol = 5,
  width = 6, height = 5, dpi = 600)
dev.off()

saveWorkbook(wb = wb, file = "./report.xlsx", overwrite = TRUE)

3.3 JSON

### Biblioteki
library(rjson)
library(dplyr)

### -----------------------------------------------------------------
### Pakiet rjson

### Wczytywanie danych
d1 <- rjson::fromJSON(file = "http://michal.ramsza.org/lectures/2_r_programming/data/data_3.json")
d2 <- as_tibble(d1)
d2

### Saving to a file
write(
  x = rjson::toJSON(
               x = list(
                 id = 1:5,
                 value = rnorm(5),
                 class = sample(letters[1:2], size = 5, replace = TRUE))),
  file = "./data_test.json")

### -----------------------------------------------------------------
### Pakiet jsonlite
library(jsonlite)

### Wczytywanie danych
d3 <- as_tibble(jsonlite::fromJSON(txt = "http://michal.ramsza.org/lectures/2_r_programming/data/data_3.json"))
d3

### Konwersja danych do sensowych typów
### TODO: Poprawić!
toDate <- function(d) {
  ret <- unlist(strsplit(x = d, split = "/", fixed = TRUE))
  ret <- paste(
    formatC(x = ret[[3]], width = 4, flag = "0"),
    formatC(x = ret[[1]], width = 2, flag = "0"),
    formatC(x = ret[[2]], width = 2, flag = "0"),
    sep = "-")
  as.POSIXct(ret)
}

d3a <- d3 %>%
  mutate(Salary = as.double(Salary)) %>%
  mutate(Date = toDate(StartDate)) %>%
  select(Dept, Name, Salary, Date) %>%
  arrange(Dept, desc(Salary))
d3a

### Zapisywanie do pliku
write(
  x = jsonlite::toJSON(
                  x = list(
                    id = 1:5,
                    value = rnorm(5),
                    class = sample(letters[1:2], size = 5, replace = TRUE))),
      file = "./data_test_2.json")

Poniżej jest nieco bardziej skomplikowany dokumenty JSON (NDJSON).

### Pakiety
library(jsonlite)
library(purrr)
library(dplyr)
library(ggplot2)

### Typowe czytanie nie będzie działało
jsonlite::fromJSON("http://michal.ramsza.org/lectures/2_r_programming/data/data_4.json")

### Czytanie linia po linii (każda linia to obiekt JSON)
readNDJSON <- function(path){
  ## prosty operator łączenia
  `%C%` <- c

  ## otwieranie połączenia do pliku
  conn <- file(description = path, open = "r")

  ## Wczytywanie po linii
  ret <- list()
  while(TRUE){
    d <- readLines(conn, n = 1)
    if(length(d) == 0 ){
      break
    }
    ret <- ret %C% list(fromJSON(d))
  }

  ## zamykanie połączenia
  close(conn)

  ## zwracanie wczytanych wartości
  ret
}

### Wczytanie pliku w formacie NDJSON
test <- suppressWarnings(
  readNDJSON(path = "http://michal.ramsza.org/lectures/2_r_programming/data/data_4.json")
)

### Jaka jest struktura tego pliku
class(test)
typeof(test)
length(test)
names(test)
test[[1]]
names(test[[1]])
test[[1]]$url
names(test[[1]]$result)

test[[1]]$result$timestamp
test[[1]]$result$sequenceNumber
test[[1]]$result$pageData

names(test[[1]]$result$extractorData)

test[[1]]$result$extractorData$url
class(test[[1]]$result$extractorData$data)
unlist(test[[1]]$result$extractorData$data)

### Prosta analiza

### Upraszczanie wczytanej listy (flatten)
test <- lapply(X = test, FUN = purrr::flatten)

### Funkcja do wyciągania danych
extractInfo <- function(s) {
  temp <- s$extractorData$data[[1]][[1]][[1]]
  ell <- length(temp)
  c1 <- rep(s$url, ell)
  c2 <- unlist(lapply(temp, function(s){s$href}))
  c3 <- unlist(lapply(temp, function(s){s$text}))
  tibble(url = c1, href = c2, text = c3)
}

### Dla każdego obiektu JSON tworzymy tibble i łączymy wierszami
d <- bind_rows(lapply(test, extractInfo))
d

### Prosta analiza: długość tekstu
d1 <- d %>%
  select(text) %>%
  mutate(n = nchar(text), class = substr(text, 1, 1))
d1

ggplot(d1, aes(x = class, fill = n)) +
  geom_bar() +
  scale_y_continuous(breaks = 1:10)

3.4 Bazy danych (SQLite)

R może pracować z dużą liczbą różnych baz danych. Poniżej pokazane są przykłady dla relacyjnej proste bazy danych SQLite.

### Biblioteki
library(DBI)
library(RSQLite)

### Przykładowe dane
tab1 <- data.frame(id = 1:3, label = c("Lab 1", "Lab 2", "Lab 3"))
tab2 <- data.frame(value = rnorm(20), idlab = sample(1:3, size = 20, replace = TRUE))

### -----------------------------------------------------------------
### Tworzenie proste bazy danych

### Łączenie z bazą
driver <- dbDriver("SQLite")
database <- dbConnect(drv = driver, dbname = "./database.db")

### Zapisywanie obu tabel
dbWriteTable(conn = database, name = "xlabels", value = tab1)
dbWriteTable(conn = database, name = "xvalues", value = tab2)

### Zamykanie połączenia
dbDisconnect(conn = database)

### -----------------------------------------------------------------
### Sprawdzanie bazy danych

### Połączenie do bazy danych
database <- dbConnect(drv = driver, dbname = "./database.db")

### Lista wszystkich tabel
tabels <- dbListTables(conn = database)
tabels

### Pola w tabelach
lapply(tabels, function(x){dbListFields(conn = database, name = x)})

### Rozłączenie
dbDisconnect(conn = database)

### -----------------------------------------------------------------
### Czytanie z tabel

### Otwieranie połączenia
database <- dbConnect(drv = driver, dbname = "./database.db")

### Lista tabel
tabels <- dbListTables(conn = database)
tabels

### Czytanie z tabel
dbReadTable(conn = database, name = tabels[1] )
dbReadTable(conn = database, name = tabels[2] )

### Rozłączenie
dbDisconnect(conn = database)

Oprócz wykorzytania interfejsu z poziomu pakietu DBI można wykorzystać dowolny kod SQL.

### Połączenie
database <- dbConnect(drv = driver, dbname = "./database.db")

### Proste zapytanie SQL
query <-
  "SELECT xvalues.value, xlabels.label 
   FROM xvalues
   LEFT JOIN xlabels
   ON xvalues.idlab = xlabels.id
   ORDER BY xlabels.label ASC"

### Wykorzystanie zapytania
res <- dbGetQuery(conn = database, statement = query)
res

### Rozłączenie
dbDisconnect(conn = database)

### -----------------------------------------------------------------

### Przykładowe dane
data <- data.frame(value = rnorm(10), idlab = sample(1:3, 10, replace = TRUE) )
data

### Połączenie
database <- dbConnect(drv = driver, dbname = "./database.db")

### Wstawianie danych
`%G%` <- paste0
for(k in 1:dim(data)[1]){
  query <- "INSERT INTO xvalues (value,idlab) values (" %G%
    as.character(data$value[k]) %G% "," %G%
    as.character(data$idlab[k]) %G% ")"
  dbExecute(conn = database, statement = query)
}

### Rozłączenie
dbDisconnect(conn = database)

3.5 Automatyka raportowania stargazer

### Pakiety
library(stargazer)
library(dplyr)
library(ggplot2)

### Przykładowe dane
data(cars)
head(cars)

?stargazer

### Exporting data to LaTeX table
stargazer(cars[1:5,],
          out = "./latex/tab1.tex",
          summary = FALSE,
          rownames = FALSE,
          title = "First few rows of the dataframe used in the model.",
          align = TRUE,
          font.size = "small",
          no.space = TRUE,
          label = "tab:1")

### Basic model
m <- lm(dist ~ -1 + I(speed^2), data = cars)
m1 <- lm(dist ~ +1 + I(speed^2), data = cars)

### Predictions
ndata <- tibble(speed = seq(from = min(cars$speed), to = max(cars$speed), length.out = 50))
pred <- as_tibble(predict(object = m, newdata = ndata, interval = "prediction"))
pred

### All data
d <- as_tibble(cars %>% mutate(speedNew = ndata$speed))
d <- bind_cols(d, pred)
d

### Creating a nice image
ggplot(d, aes(x = speedNew, y = fit)) +
  geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "gray") +
  geom_line() + 
  geom_point(aes(x = speed, y = dist), color = "blue", pch = 4)

### Saving an image
ggsave(filename = "./latex/fig1.pdf")

### Creating a table for exporting
stargazer(m, m1,
          out = "./latex/tab2.tex",
          type = "latex",
          title = "Regression results",
          label = "tab:res",
          font.size = "small",
          no.space = TRUE,
          align = TRUE,
          dep.var.labels = "Braking distance",
          covariate.labels = c("Speed", "Constant")
          )

3.6 Eurostat API

Jako przykład wykorzytania API wykorzystami API Eurostatu do stworzenia proste analizy.

### Libraries
library(eurostat)
library(dplyr)
library(ggplot2)

### Get a list of all tables
toc <- get_eurostat_toc()
toc
toc %>% select(title, code)

### Let's look for GDP
res1 <- search_eurostat("GDP and main components")
res1
res1 %>% select(title, code)

### Downloading the data
d <- get_eurostat(id = "nama_10_gdp")
str(d)

d1 <- d %>% filter(na_item == "B1G", geo == "PL", unit == "CLV05_MEUR") %>%
  arrange(time) %>%
  mutate(year = as.integer(substr(time, 1, 4))) %>%
  select(year, values)

d1
print(d1, n = Inf)

### Ploting the data
ggplot(d1, aes(x = year, y = values)) +
  geom_point() +
  geom_line()

### Fitting a model
library(forecast)
d1Test <- d1 %>% filter(year < 2020)
m <- ets(y = d1Test$values, model = "ZZZ")
dp <- tibble(year = d1Test$year, value = m$fitted)
dp

### Tworzenie prognozy
last <- function(d) {d[length(d)]}

fc <- tibble(
  year = as.integer(last(d1Test$year) + 1),
  value = as.vector(forecast(object = m, h = 1)$mean))


d2 <- tibble(
  year = c(dp$year, fc$year),
  value = c(dp$value, fc$value)
)

d2

### Making another chart with fitted values
ggplot(d1, aes(x = year, y = values)) +
  geom_point() +
  geom_line() +
  geom_point(data = d2, aes(x = year, y = value), color = "red") +
  geom_line(data = d2, aes(x = year, y = value), color = "red")

Date: 2021-10-08 Fri 00:00

Author: Michał Ramsza

Created: 2021-11-29 Mon 15:36

Validate