Dodatkowe tematy
Table of Contents
1 Manipulowanie danymi
- data frames
tibble
packagesdplyr
package
1.1 Tibble
### Czyszczenie rm(list = ls()) ### Ramki danych data.frame(id = 1:5, value = rnorm(5)) a <- data.frame(id = 1:100, value = rnorm(100)) a head(a) tail(a) str(a) ### Czego nie można zrobić z ramką danych b <- data.frame( x = rnorm(100), y = 1 + x + rnorm(100)) ### Wgrywanie biblioteki library(tibble) ### Tworzenie tibble'a a <- tibble( x = rnorm(100), y = 1 + x + rnorm(100), classX = ifelse(x > 0, "a", "b"), classY = ifelse(y > 0, "A", "B")) a print(a, n = 30) typeof(a) class(a) ### Kowersja do tibble'a b <- data.frame(id = 1:100, value = rnorm(100)) class(b) bTB <- as_tibble(b) bTB class(bTB) ### Indeksowanie tibble'a jest identyczne jak w przypadku ramki danych class(a) names(a) a$x a[c("x", "classY")] a["x"] a[a$x > a$y, "classX"] a[["x"]] ### Typ bazowy jest standardowy typeof(a)
1.2 Pakiet dplyr
Działanie pakietu dplyr
bazuje głównie na wykorzystaniu następujący operatorów
%>%
select()
filter()
arrange()
mutate()
summarise()
group_by()
pull()
- operatory
*_join
### Czyszczenie rm(list = ls()) ### Biblioteki library(dplyr) ### Dane d <- as_tibble(read.csv("http://michal.ramsza.org/lectures/2_r_programming/data/data_2.csv")) dim(d) d ### select() select(.data = d, "Price", "Mileage") d %>% select("Price", "Mileage") d %>% select(Price, Mileage) plot(d %>% select("Mileage", "Price")) plot(d %>% select("Mileage", "Price"), col = rgb(0, 0, 1, 0.1), pch = 20) d d %>% select(-Brand, -Model, -Negotiation, -City) d %>% select(Gas_type:Price) d %>% select(starts_with("M")) d %>% select(one_of("Price", "Mileage", "Owner")) suppressWarnings(d %>% select(one_of("Price", "Mileage", "Owner"))) d[, c("Price", "Mileage", "Owner")] # fails ### filter d %>% filter(Price < 1000) sort(unique(d$Brand)) d %>% filter(Price < 2000 & Brand == "Audi") d %>% filter(Price < 1000 & Brand == "Fiat" | Price < 2000 & Brand == "Audi") d %>% filter(Brand == "Audi" & Voivodeship == "Mazowieckie") %>% select(Price) %>% pull(Price) ### Przykład colBorder = rgb(1, 0, 0, .8) colMain = rgb(1, 0, 0, .2) hist( d %>% filter(Brand == "Audi" & Voivodeship == "Mazowieckie") %>% select(Price) %>% pull(Price), probability = TRUE, breaks = "Scott", col = colMain, border = colBorder, main = "Rozkład cen", xlab = "cena", ylab = "gęstość" ) grid(lty = "solid", col = colMain) ### Przykład colBorder = rgb(1, 0, 0, .8) colMain = rgb(1, 0, 0, .2) colBorder2 = rgb(0, 1, 0, .8) colMain2 = rgb(0, 1, 0, .2) audi <- d %>% filter(Brand == "Audi" & Voivodeship == "Mazowieckie") %>% select(Price) %>% pull(Price) fiat <- d %>% filter(Brand == "Fiat" & Voivodeship == "Mazowieckie") %>% select(Price) %>% pull(Price) par(mfrow = c(2, 1)) hist( audi, probability = TRUE, breaks = seq(min(audi), max(audi), length.out = 100), col = colMain, border = colBorder, main = "Rozkład cen Audi", xlab = "cena", ylab = "gęstość" ) grid(lty = "solid", col = colMain) hist( fiat, probability = TRUE, breaks = seq(min(fiat), max(fiat), length.out = 100), col = colMain2, border = colBorder2, main = "Rozkład cen Fiat", xlab = "cena", ylab = "gęstość" ) grid(lty = "solid", col = colMain2) ### Przykład d1 <- d %>% select(Brand, Mileage, Price) %>% filter(Brand %in% c("Fiat", "Audi")) xr <- range(pull(.data = d1, var = "Mileage")) yr <- range(pull(.data = d1, var = "Price")) par(mfrow = c(1, 1)) plot( d1 %>% filter(Brand == "Audi") %>% select(Mileage, Price), pch = 20, col = rgb(1, 0, 0, .03) ) points( d1 %>% filter(Brand == "Fiat") %>% select(Mileage, Price), pch = 20, col = rgb(0, 0, 1, .03) ) lines( lowess(x = d1 %>% filter(Brand == "Audi") %>% select(Mileage, Price)), col = "red", lwd = 3) lines( lowess(x = d1 %>% filter(Brand == "Fiat") %>% select(Mileage, Price)), col = "blue", lwd = 3) ### arrange d %>% arrange(Price) d %>% arrange(desc(Price)) d %>% arrange(Brand, desc(Price)) d %>% filter(Brand %in% c("Honda", "Fiat")) %>% arrange(desc(Price), Brand) ### mutate d2 <- d %>% select(Year_prod, Price, Mileage) %>% mutate(MpY = Mileage / (2019 - Year_prod)) %>% select(MpY, Price) plot(d2, col = rgb(0, 0, 1, 0.03), pch = 20, cex = 1) ### Przykład d2 <- d %>% select(Year_prod, Price, Mileage) %>% mutate( MpY = Mileage / (2019 - Year_prod), Class = ifelse(abs(mean(Price)-Price) > sd(Price), "Red", "Blue") ) %>% select(MpY, Price, Class) plot(d2 %>% select(MpY, Price), col = pull(d2, "Class"), pch = 20, cex = 1) legend(x = "topright", legend = c("Wysoka cena", "Cena w normie"), fill = c("red", "blue")) ### summarise d %>% summarise( "średni przebieg" = mean(Mileage), "średnia cena" = mean(Price), "minimalna cena" = min(Price), "maksymalna cena" = max(Price) ) ### group_by d %>% group_by(Brand) %>% summarise(mean_price = mean(Price), std_dev = sd(Price)) %>% arrange(Brand, mean_price) d %>% group_by(Voivodeship) %>% summarise(mean_price = mean(Price)) %>% arrange(desc(mean_price)) d %>% group_by(Voivodeship) %>% summarise(num_of_offers = n()) %>% arrange(desc(num_of_offers)) d %>% mutate(newCar = ifelse(Mileage < 20000, 1, 0)) %>% group_by(Voivodeship, newCar) %>% summarise(num_of_offers = n()) %>% arrange(Voivodeship, desc(num_of_offers)) d %>% mutate(newCar = ifelse(Mileage < 20000, 1, 0)) %>% group_by(Voivodeship, newCar) %>% summarise(num_of_offers = n()) %>% arrange(Voivodeship, desc(num_of_offers)) %>% ungroup() ### Przykład d %>% mutate(newCar = ifelse(Mileage < 20000, 1, 0)) %>% group_by(Voivodeship, newCar) %>% summarise(num_of_offers = n(), .groups = "drop") %>% arrange(Voivodeship, desc(num_of_offers)) ### Przykład d1 <- d %>% mutate(newCar = ifelse(Mileage < 20000, 1, 0)) %>% group_by(Voivodeship, newCar) %>% summarise(num_of_offers = n(), .groups = "drop") %>% arrange(Voivodeship, desc(num_of_offers)) d1 tibble( voi = d1 %>% filter(newCar == 0) %>% select(Voivodeship) %>% pull(), oldCarN = d1 %>% filter(newCar == 0) %>% select(num_of_offers) %>% pull(), newCarN = d1 %>% filter(newCar == 1) %>% select(num_of_offers) %>% pull() ) %>% arrange(desc(oldCarN)) ### Łączenie tabel *_join a <- tibble(id = (1:5)[-2], name = LETTERS[1:4], value = rnorm(4)) a b <- tibble( id = 1:4, nameSmall = sample(letters[1:2], 4, replace = TRUE), value2 = rbinom(4, 10, .5)) b left_join(a, b, by = "id") right_join(a, b, by = "id") inner_join(a, b, by = "id") full_join(a, b, by = "id") a %>% left_join(b, by = "id") %>% select(id, value, value2)
2 Grafika
2.1 Standardowa grafika
- standard stack (
par()
,plot()
,lines()
, …)
### Czyszczenie rm(list = ls()) ### Pakiety library(dplyr) ### Przykładowe dane data(iris) d <- as_tibble(iris) dd <- d %>% select(-Species) ddd <- cmdscale(dist(dd), eig = TRUE, k = 2)$points ### Przykładowy wykres plot(ddd) plot(ddd, pch = 20, cex = 2, col = rgb(1, 0, 0, .5), xlab = "wymiar 1", ylab = "wymiar 2", las = 1) grid() ### Lista wszystkich opcji .Pars graphics::.Pars graphics:::.Pars length(graphics:::.Pars) ### Lista wartości dla wszystkich opcji par() names(par()) graphics:::.Pars ### Zmiana wartości dla przykładowej opcji par()$fg par(fg = "magenta") par()$fg plot(ddd, pch = 20, cex = 2, col = rgb(1, 0, 0, .5), xlab = "wymiar 1", ylab = "wymiar 2", las = 1) grid() ?par ### Nieco bardziej skomplikowana grafika par(mai = c(1, 1, 2, .5), bg = rgb(.1, .1, .1) ) plot(ddd, main = "Reduced data", xlab = "Reduced dim 1", ylab = "Reduced dim 2", xlim = c(-4, 4), ylim = c(-4, 4), pch = 20, cex = 2, col = rgb(0, 0, 1, .8), bty = "7", cex.main = 4, cex.axis = .5, cex.lab = 2, col.axis = rgb(0, 1, 1), col.main = rgb(0, 0.8, 1), col.lab = "red", fg = "red", las = 1, xaxp = c(-4, 4, 10), yaxp = c(-4, 4, 20) ) grid(col = "white", lty = 2, lwd = 0.5) ### Nieco bardziej przemyślany przykład names(iris) iris$Species d1 <- iris %>% filter(Species == "setosa") %>% select(Sepal.Length, Sepal.Width) d2 <- iris %>% filter(Species == "versicolor") %>% select(Sepal.Length, Sepal.Width) head(d1) head(d2) xrange <- range(d1$Sepal.Length, d2$Sepal.Length) yrange = range(d1$Sepal.Width, d2$Sepal.Width) s <- 0.8 # margins k <- .9 # color par(mfrow = c(2, 1), oma = c(2, 1, 1, 2), mai = c(s, 1, s/2, 1), cex.axis = .9, bg = "white", fg = "black") plot(d1, xlim = xrange, ylim = yrange, pch = 20, col = rgb(k, 0, 1, .5), font.lab = 3 ) mtext(text = "Setosa", outer = FALSE, line = 1, side = 4, cex = 2, col = rgb(k, 0, 1, .2), font = 4) abline(lm(Sepal.Width ~ Sepal.Length, data = d1), col = rgb(k, 0, 1, .4), lty = 4) plot(d2, xlim = xrange, ylim = yrange, pch = 20, col = rgb(0, k, 1, .5), font.lab = 3 ) mtext(text = "Versicolor", outer = FALSE, line = 1, side = 4, cex = 2, col = rgb(0, k, 1, .2), font = 4) abline(lm(Sepal.Width ~ Sepal.Length, data = d2), col = rgb(0, k, 1, .4), lty = 4) dev.copy(device = png, "./fig.png", width = 600, height = 1200) dev.off()
2.2 Pakiet ggplot2
ggplot2
package (https://www.springer.com/gp/book/9783319242750, https://ggplot2-book.org/)
### Czyszczenie rm(list = ls()) ### Pakiety library(dplyr) library(ggplot2) ### Dane d <- as_tibble(read.csv("http://michal.ramsza.org/lectures/2_r_programming/data/data_2.csv")) ### Pierwszy wykres / gdzie jest wykres? p1 <- ggplot(data = d, aes(x = Mileage, y = Price)) + geom_point() typeof(p1) names(p1) p1$labels p1$labels$x <- "Przebieg samochodu" print(p1) ### Bardziej zaawansowane mapowania i geomy ggplot( data = d %>% filter(Brand %in% c("Audi", "Fiat")), aes(x = Mileage, y = Price, color = Brand)) + geom_point(alpha = 0.4) ggplot( data = d %>% filter(Brand %in% c("Audi", "Fiat")), aes(x = Mileage, y = Price, color = Mileage, alpha = Price)) + geom_point() d1 <- d %>% filter(Brand %in% c("Fiat", "Audi")) ggplot(data = d1, aes(x = Mileage, y = Price, color = Brand, size = Engine_capacity)) + geom_point(alpha = 0.4) ggplot(d, aes(x = Mileage)) + geom_histogram(binwidth = 10000) ggplot(d, aes(x = Mileage)) + geom_freqpoly(binwidth = 1000) d1 <- d %>% select(Brand) %>% filter(Brand %in% c("Audi", "Fiat", "Honda", "Toyota")) %>% arrange(Brand) ggplot(d1, aes(x = Brand)) + geom_bar(color = "red", fill = "green") # liczba elementów d1 <- d %>% select(Brand, Price) %>% filter(Brand %in% c("Audi", "Fiat", "Honda", "Toyota")) %>% group_by(Brand) %>% summarise(mean_price = mean(Price)) d1 ggplot(d1, aes(x = Brand, y = mean_price)) + geom_bar(stat = "identity") # podana statystyka d1 <- d %>% select(Brand, Price) %>% filter(Brand %in% c("Audi", "Fiat", "Honda", "Toyota")) d1 ggplot(d1, aes(x = Brand, y = Price)) + geom_boxplot() ggplot(d1, aes(x = Brand, y = Price)) + geom_jitter() ggplot(d1, aes(x = Brand, y = Price)) + geom_violin() d1 <- d %>% select(Brand, Price) %>% filter(Brand %in% c("Audi", "Fiat")) d1 ### faceting ggplot(d1, aes(x = Price)) + geom_histogram(bins = 200) + facet_wrap(~ Brand, ncol = 1) ### proste zmiany w stylu d3 <- d %>% select(Year_prod, Price, Brand) %>% group_by(Year_prod, Brand) %>% summarise(meanPrice = mean(Price)) %>% arrange(Brand, Year_prod) d3 g <- ggplot(d3, aes(x = Year_prod, y = meanPrice, color = Brand)) + geom_point() + geom_line() + xlab("year of production") + ylab("average price") + xlim(1990, 2010) + ylim(NA, 10^6) + labs(title = "Average yearly prices", subtitle = "Data for used cars", caption = "Crash course on R") print(g) ### mniej proste zmiany stylu g + theme( plot.title=element_text(size=20, face="bold", family="American Typewriter", color="tomato", hjust=0.5, lineheight=1.2), # title plot.subtitle=element_text(size=15, family="American Typewriter", face="bold", hjust=0.5), # subtitle plot.caption=element_text(size=15), # caption axis.title.x=element_text(vjust=10, size=15), # X axis title axis.title.y=element_text(size=15), # Y axis title axis.text.x=element_text(size=10, angle = 30, vjust=.5), # X axis text axis.text.y=element_text(size=10)) # Y axis text
2.3 Tworzenie wykresów z mapami
Czasami trzeba stworzyć wykres, który jest mapą. Poniżej jest prosty przykład tworzenia takich wykresów bez wykorzytania zewnętrznych plików z danymi dotyczącymi mapy.
### Biblioteki library(ggplot2) library(grid) library(dplyr) library(rworldmap) ### Tworzenie danych do mapy worldMap <- getMap() ### Definiowanie nazw krajów EU (poza Luksemburg, Malta oraz Cypr) europeanUnion <- c( "Austria","Belgium","Bulgaria","Croatia", "Czech Rep.","Denmark","Estonia","Finland","France", "Germany","Greece","Hungary","Ireland","Italy","Latvia", "Lithuania","Netherlands","Poland", "Portugal","Romania","Slovakia","Slovenia","Spain", "Sweden") indEU <- which(worldMap$NAME %in% europeanUnion) ## Współrzędne geograficzne europeCoords <- lapply( X = indEU, FUN = function(i){ df <- data.frame(worldMap@polygons[[i]]@Polygons[[1]]@coords) df$region =as.character(worldMap$NAME[i]) colnames(df) <- list("long", "lat", "region") return(df) } ) europeCoords <- do.call("rbind", europeCoords) ### Nadawanie przykładowych wartości countryData <- tibble( countryName = europeanUnion, value = ifelse(rnorm(length(countryName)) < 0, "Negative", "Positive") ) europeCoords$value <- countryData$value[match(europeCoords$region,countryData$countryName)] ### Tworzenie przykładowej mapy P <- ggplot() + geom_polygon( data = europeCoords, aes(x = long, y = lat, group = region, fill = value), colour = "black", size = 0.2) + coord_map(xlim = c(-13, 35), ylim = c(32, 71)) P <- P + scale_fill_manual( name = "Country class", values = c("blue", "red", "green")) P <- P + theme( panel.grid.minor = element_line(colour = NA), panel.background = element_rect(fill = NA, colour = NA), axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks.x = element_blank(), axis.ticks.y = element_blank(), axis.title = element_blank(), rect = element_blank(), plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) print(P)
3 Import i eksport danych
3.1 Text based formats
read.csv
,write.csv
read.table
,write.table
### Czyszczenie rm(list = ls()) ### Przykładowe dane d <- data.frame( id = 1:5, value = rnorm(5), class = sample(letters[1:2], 5, replace = TRUE)) d ### CSV write.csv(file = "./dane.csv", d) system("ls -l", intern = TRUE) system("cat ./dane.csv", intern = TRUE) read.csv(file = "./dane.csv") ### write.table write.table(file = "./dane1.data", x = d, sep = " $ ", dec = "@", row.names = FALSE, fileEncoding = "UTF-8") system("cat ./dane1.data", intern = TRUE) read.table(file = "./dane1.data", sep = "$", dec = "@", header = TRUE, fileEncoding = "UTF-8")
3.2 XLSX format
openxlsx
package
### Czyszczenie rm(list = ls()) ### Biblioteki library(openxlsx) library(dplyr) ### ----------------------------------------------------------------- ### Czytanie data1 <- read.xlsx( xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx", sheet = "CPI", startRow = 1, colNames = TRUE, rowNames = FALSE) head(data1) data2 <- read.xlsx( xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx",a sheet = "CPI", startRow = 1, colNames = FALSE, rowNames = FALSE) head(data2) data3 <- read.xlsx( xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx", sheet = "CPI", startRow = 2, colNames = FALSE, rowNames = FALSE) head(data3) data4 <- read.xlsx( xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx", sheet = "CPI2", startRow = 1, colNames = TRUE, rowNames = TRUE) head(data4) ### Czytanie fragmentów data5 <- read.xlsx( xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx", sheet = "CPI2", colNames = TRUE, rows = 1:5) data5 data6 <- read.xlsx( xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx", sheet = "CPI2", colNames = TRUE, rows = 1:5, cols = 2:4) data6 data7 <- read.xlsx( xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx", sheet = "CPI3", colNames = TRUE, startRow = 1, skipEmptyRows = TRUE) data7 data8 <- read.xlsx( xlsxFile = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx", sheet = "CPI3", colNames = TRUE, startRow = 1, skipEmptyRows = FALSE) data8 ### Bardziej skomplikowany przykład readClientTable <- function(fileName, sheetName, xrow, xcol){ ## czytanie nazwy klienta klient <- unlist(read.xlsx(xlsxFile = fileName, sheet = sheetName, colNames = FALSE, rowNames = FALSE, rows = xrow, cols = xcol)) ## czytanie lat lata <- unlist(read.xlsx(xlsxFile = fileName, sheet = sheetName, colNames = FALSE, rowNames = FALSE, rows = xrow + 1, cols = (xcol + 1):(xcol + 5))) ## czytanie etykiet kategorie <- unlist(read.xlsx(xlsxFile = fileName, sheet = sheetName, colNames = FALSE, rowNames = FALSE, rows = (xrow + 2):(xrow + 4), cols = xcol)) ## czytanie wartości wartosci <- as.matrix(read.xlsx(xlsxFile = fileName, sheet = sheetName, colNames = FALSE, rowNames = FALSE, rows = (xrow + 2):(xrow + 4), cols = (xcol + 1):(xcol + 5))) ## tworzenie macierzy colnames(wartosci) <- lata row.names(wartosci) <- kategorie wartosci } readClientTable( fileName = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx", sheetName = "Data", xrow = 5, xcol = 3) readClientTable( fileName = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx", sheetName = "Data", xrow = 5, xcol = 11) readClientTable( fileName = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx", sheetName = "Data", xrow = 12, xcol = 3) readClientTable( fileName = "http://michal.ramsza.org/lectures/2_r_programming/data/data_1.xlsx", sheetName = "Data", xrow = 12, xcol = 11) ### ----------------------------------------------------------------- ### Tworzenie raportów w XLSX ### Przykład prosty data <- data.frame(label = letters) %>% mutate(x = rnorm(length(label))) %>% mutate(y = 2 * x + 1 + rnorm(length(x))) head(data) write.xlsx(x = data, file = "./data_3.xlsx") write.xlsx(x = data, file = "./data_3a.xlsx", asTable = TRUE) write.xlsx(x = data, file = "./data_4.xlsx", asTable = TRUE, title = "The X data", creator = "Michał Ramsza", sheetName = "Some data", gridLines = FALSE, tabColour = "red" ) ### ----------------------------------------------------------------- ### Zrobienie znacznie bardziej skomplikowanego raportu data <- data.frame(x = rnorm(500)) %>% mutate(y = 2 * x + 0.3 * x^2 - 1 + rnorm(500)) head(data) ### Tworzenie wirtualnego pliku wb <- createWorkbook( creator = "Michał Ramsza", title = "Example workbook", subject = "Linear regression", category = "Workshop" ) wb ### Dodawanie arkuszy addWorksheet(wb = wb, sheetName = "Linear regression", tabColour = "blue", gridLines = FALSE, header = c("Left", "&[Date]", "Right"), footer = c("Left", "Środek", "Right"), visible = TRUE) wb styl1 <- createStyle( fontName = "Calibri", fontSize = 14, fontColour = "tomato1", numFmt = "GENERAL", fgFill = "snow2", borderColour = "snow3", halign = "center", valign = "center", ) styl2 <- createStyle( fontName = "Calibri", fontSize = 11, fontColour = "black", numFmt = "NUMBER", border = "TopBottomLeftRight", borderColour = "black", halign = "center", valign = "center", ) etykiety <- names(data) dim(etykiety) <- c(1, 2) writeData(wb = wb, sheet = 1, x = etykiety, startCol = 2, startRow = 2,colNames = FALSE, rowNames = FALSE) addStyle(wb = wb, sheet = 1, style = styl1, rows = 2, cols = 2:3, gridExpand = TRUE) setRowHeights(wb = wb, sheet = 1, rows = 2, heights = 40) setColWidths(wb = wb, sheet = 1, cols = 2:3, widths = 20) writeData(wb = wb, sheet = 1, x = data, startCol = 2, startRow = 3, colNames = FALSE, rowNames = FALSE) addStyle(wb = wb, sheet = 1, style = styl2, rows = 3:502, cols = 2:3, gridExpand = TRUE) plot(data, pch = 20, cex = 2, col = rgb(0, 0, 1, 0.2), main = "Comparison of linear regression\nand polynomial smoothing") grid() abline(lm(y ~ x, data = data), col = "blue", lty = "dashed") lines(lowess(x = data$x, y = data$y, f = 1/2)) legend(x = "topleft", legend = c("linear regression", "LOWESS"), col = c("blue", "black"), lty = c("dashed", "solid"), bg = "white", y.intersp = 2, text.width = 2, bty = "o") insertPlot(wb = wb, sheet = 1, startRow = 2, startCol = 5, width = 6, height = 5, dpi = 600) dev.off() saveWorkbook(wb = wb, file = "./report.xlsx", overwrite = TRUE)
3.3 JSON
### Biblioteki library(rjson) library(dplyr) ### ----------------------------------------------------------------- ### Pakiet rjson ### Wczytywanie danych d1 <- rjson::fromJSON(file = "http://michal.ramsza.org/lectures/2_r_programming/data/data_3.json") d2 <- as_tibble(d1) d2 ### Saving to a file write( x = rjson::toJSON( x = list( id = 1:5, value = rnorm(5), class = sample(letters[1:2], size = 5, replace = TRUE))), file = "./data_test.json") ### ----------------------------------------------------------------- ### Pakiet jsonlite library(jsonlite) ### Wczytywanie danych d3 <- as_tibble(jsonlite::fromJSON(txt = "http://michal.ramsza.org/lectures/2_r_programming/data/data_3.json")) d3 ### Konwersja danych do sensowych typów ### TODO: Poprawić! toDate <- function(d) { ret <- unlist(strsplit(x = d, split = "/", fixed = TRUE)) ret <- paste( formatC(x = ret[[3]], width = 4, flag = "0"), formatC(x = ret[[1]], width = 2, flag = "0"), formatC(x = ret[[2]], width = 2, flag = "0"), sep = "-") as.POSIXct(ret) } d3a <- d3 %>% mutate(Salary = as.double(Salary)) %>% mutate(Date = toDate(StartDate)) %>% select(Dept, Name, Salary, Date) %>% arrange(Dept, desc(Salary)) d3a ### Zapisywanie do pliku write( x = jsonlite::toJSON( x = list( id = 1:5, value = rnorm(5), class = sample(letters[1:2], size = 5, replace = TRUE))), file = "./data_test_2.json")
Poniżej jest nieco bardziej skomplikowany dokumenty JSON (NDJSON).
### Pakiety library(jsonlite) library(purrr) library(dplyr) library(ggplot2) ### Typowe czytanie nie będzie działało jsonlite::fromJSON("http://michal.ramsza.org/lectures/2_r_programming/data/data_4.json") ### Czytanie linia po linii (każda linia to obiekt JSON) readNDJSON <- function(path){ ## prosty operator łączenia `%C%` <- c ## otwieranie połączenia do pliku conn <- file(description = path, open = "r") ## Wczytywanie po linii ret <- list() while(TRUE){ d <- readLines(conn, n = 1) if(length(d) == 0 ){ break } ret <- ret %C% list(fromJSON(d)) } ## zamykanie połączenia close(conn) ## zwracanie wczytanych wartości ret } ### Wczytanie pliku w formacie NDJSON test <- suppressWarnings( readNDJSON(path = "http://michal.ramsza.org/lectures/2_r_programming/data/data_4.json") ) ### Jaka jest struktura tego pliku class(test) typeof(test) length(test) names(test) test[[1]] names(test[[1]]) test[[1]]$url names(test[[1]]$result) test[[1]]$result$timestamp test[[1]]$result$sequenceNumber test[[1]]$result$pageData names(test[[1]]$result$extractorData) test[[1]]$result$extractorData$url class(test[[1]]$result$extractorData$data) unlist(test[[1]]$result$extractorData$data) ### Prosta analiza ### Upraszczanie wczytanej listy (flatten) test <- lapply(X = test, FUN = purrr::flatten) ### Funkcja do wyciągania danych extractInfo <- function(s) { temp <- s$extractorData$data[[1]][[1]][[1]] ell <- length(temp) c1 <- rep(s$url, ell) c2 <- unlist(lapply(temp, function(s){s$href})) c3 <- unlist(lapply(temp, function(s){s$text})) tibble(url = c1, href = c2, text = c3) } ### Dla każdego obiektu JSON tworzymy tibble i łączymy wierszami d <- bind_rows(lapply(test, extractInfo)) d ### Prosta analiza: długość tekstu d1 <- d %>% select(text) %>% mutate(n = nchar(text), class = substr(text, 1, 1)) d1 ggplot(d1, aes(x = class, fill = n)) + geom_bar() + scale_y_continuous(breaks = 1:10)
3.4 Bazy danych (SQLite)
R może pracować z dużą liczbą różnych baz danych. Poniżej pokazane są przykłady dla relacyjnej proste bazy danych SQLite.
### Biblioteki library(DBI) library(RSQLite) ### Przykładowe dane tab1 <- data.frame(id = 1:3, label = c("Lab 1", "Lab 2", "Lab 3")) tab2 <- data.frame(value = rnorm(20), idlab = sample(1:3, size = 20, replace = TRUE)) ### ----------------------------------------------------------------- ### Tworzenie proste bazy danych ### Łączenie z bazą driver <- dbDriver("SQLite") database <- dbConnect(drv = driver, dbname = "./database.db") ### Zapisywanie obu tabel dbWriteTable(conn = database, name = "xlabels", value = tab1) dbWriteTable(conn = database, name = "xvalues", value = tab2) ### Zamykanie połączenia dbDisconnect(conn = database) ### ----------------------------------------------------------------- ### Sprawdzanie bazy danych ### Połączenie do bazy danych database <- dbConnect(drv = driver, dbname = "./database.db") ### Lista wszystkich tabel tabels <- dbListTables(conn = database) tabels ### Pola w tabelach lapply(tabels, function(x){dbListFields(conn = database, name = x)}) ### Rozłączenie dbDisconnect(conn = database) ### ----------------------------------------------------------------- ### Czytanie z tabel ### Otwieranie połączenia database <- dbConnect(drv = driver, dbname = "./database.db") ### Lista tabel tabels <- dbListTables(conn = database) tabels ### Czytanie z tabel dbReadTable(conn = database, name = tabels[1] ) dbReadTable(conn = database, name = tabels[2] ) ### Rozłączenie dbDisconnect(conn = database)
Oprócz wykorzytania interfejsu z poziomu pakietu DBI można wykorzystać dowolny kod SQL.
### Połączenie database <- dbConnect(drv = driver, dbname = "./database.db") ### Proste zapytanie SQL query <- "SELECT xvalues.value, xlabels.label FROM xvalues LEFT JOIN xlabels ON xvalues.idlab = xlabels.id ORDER BY xlabels.label ASC" ### Wykorzystanie zapytania res <- dbGetQuery(conn = database, statement = query) res ### Rozłączenie dbDisconnect(conn = database) ### ----------------------------------------------------------------- ### Przykładowe dane data <- data.frame(value = rnorm(10), idlab = sample(1:3, 10, replace = TRUE) ) data ### Połączenie database <- dbConnect(drv = driver, dbname = "./database.db") ### Wstawianie danych `%G%` <- paste0 for(k in 1:dim(data)[1]){ query <- "INSERT INTO xvalues (value,idlab) values (" %G% as.character(data$value[k]) %G% "," %G% as.character(data$idlab[k]) %G% ")" dbExecute(conn = database, statement = query) } ### Rozłączenie dbDisconnect(conn = database)
3.5 Automatyka raportowania stargazer
### Pakiety library(stargazer) library(dplyr) library(ggplot2) ### Przykładowe dane data(cars) head(cars) ?stargazer ### Exporting data to LaTeX table stargazer(cars[1:5,], out = "./latex/tab1.tex", summary = FALSE, rownames = FALSE, title = "First few rows of the dataframe used in the model.", align = TRUE, font.size = "small", no.space = TRUE, label = "tab:1") ### Basic model m <- lm(dist ~ -1 + I(speed^2), data = cars) m1 <- lm(dist ~ +1 + I(speed^2), data = cars) ### Predictions ndata <- tibble(speed = seq(from = min(cars$speed), to = max(cars$speed), length.out = 50)) pred <- as_tibble(predict(object = m, newdata = ndata, interval = "prediction")) pred ### All data d <- as_tibble(cars %>% mutate(speedNew = ndata$speed)) d <- bind_cols(d, pred) d ### Creating a nice image ggplot(d, aes(x = speedNew, y = fit)) + geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "gray") + geom_line() + geom_point(aes(x = speed, y = dist), color = "blue", pch = 4) ### Saving an image ggsave(filename = "./latex/fig1.pdf") ### Creating a table for exporting stargazer(m, m1, out = "./latex/tab2.tex", type = "latex", title = "Regression results", label = "tab:res", font.size = "small", no.space = TRUE, align = TRUE, dep.var.labels = "Braking distance", covariate.labels = c("Speed", "Constant") )
3.6 Eurostat API
Jako przykład wykorzytania API wykorzystami API Eurostatu do stworzenia proste analizy.
### Libraries library(eurostat) library(dplyr) library(ggplot2) ### Get a list of all tables toc <- get_eurostat_toc() toc toc %>% select(title, code) ### Let's look for GDP res1 <- search_eurostat("GDP and main components") res1 res1 %>% select(title, code) ### Downloading the data d <- get_eurostat(id = "nama_10_gdp") str(d) d1 <- d %>% filter(na_item == "B1G", geo == "PL", unit == "CLV05_MEUR") %>% arrange(time) %>% mutate(year = as.integer(substr(time, 1, 4))) %>% select(year, values) d1 print(d1, n = Inf) ### Ploting the data ggplot(d1, aes(x = year, y = values)) + geom_point() + geom_line() ### Fitting a model library(forecast) d1Test <- d1 %>% filter(year < 2020) m <- ets(y = d1Test$values, model = "ZZZ") dp <- tibble(year = d1Test$year, value = m$fitted) dp ### Tworzenie prognozy last <- function(d) {d[length(d)]} fc <- tibble( year = as.integer(last(d1Test$year) + 1), value = as.vector(forecast(object = m, h = 1)$mean)) d2 <- tibble( year = c(dp$year, fc$year), value = c(dp$value, fc$value) ) d2 ### Making another chart with fitted values ggplot(d1, aes(x = year, y = values)) + geom_point() + geom_line() + geom_point(data = d2, aes(x = year, y = value), color = "red") + geom_line(data = d2, aes(x = year, y = value), color = "red")