library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(viridis)
## Loading required package: viridisLite
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(digest)
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(RMySQL)
## Loading required package: DBI
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(nls.multstart)
library(xtable)
library(minpack.lm)
library(broom)
# Read the raw sample and record its row count; `n_data` is the baseline for
# the "% left" figures printed after the cleaning steps.
data <- fread("/Users/tom/data_sample.csv")
n_data <- nrow(data)

# Tally the rows of type "action" for every (site, device type) combination.
action_counts <- data %>%
  filter(type == "action") %>%
  group_by(idSite, deviceType) %>%
  summarise(num_actions = n(), .groups = "drop")

# Reshape to one row per device type and one column per site id; combinations
# that never occur are filled with 0.
pivot_table <- pivot_wider(
  action_counts,
  names_from = idSite,
  values_from = num_actions,
  values_fill = 0
)

# Show the device-type-by-site table
print(pivot_table)
## # A tibble: 9 × 4
## deviceType `1` `2` `5`
## <chr> <int> <int> <int>
## 1 Console 1 0 0
## 2 Desktop 20103 16288 170443
## 3 Phablet 453 302 1328
## 4 Smartphone 16165 28217 41890
## 5 Tablet 735 1749 1346
## 6 Tv 3 0 3
## 7 Unknown 45 38 161
## 8 Peripheral 0 1 2
## 9 Wearable 0 0 2
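We remove the rarely observed device types (Console, Tv, Unknown, Peripheral, Wearable) and count phablets as smartphones, so that only Desktop, Smartphone and Tablet remain.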
# Relabel phablets as smartphones, then keep only the Desktop, Tablet and
# Smartphone rows. `%in%` tests set membership in one expression instead of
# chaining three `==` comparisons with `|`; rows with a missing deviceType are
# dropped either way.
data_cleaned <- data %>%
  mutate(
    deviceType = if_else(deviceType == "Phablet", "Smartphone", deviceType)
  ) %>%
  filter(deviceType %in% c("Desktop", "Tablet", "Smartphone"))

# Report the share of the original rows that is still present.
n_data_cleaned <- nrow(data_cleaned)
print(paste(n_data_cleaned / n_data * 100, "% left after cleaning", sep = " "))
## [1] "99.9443304297338 % left after cleaning"
# Rows of type "action" get an eventValue of 0; rows of any other type keep
# the eventValue they already have.
data_cleaned <- mutate(
  data_cleaned,
  eventValue = if_else(type == "action", 0, eventValue)
)
# Ids of the visits whose rows are all of type "event", i.e. visits that
# contain no row of any other type.
visits_with_only_events <- data_cleaned %>%
  group_by(idVisit) %>%
  summarise(has_only_events = all(type == "event")) %>%
  filter(has_only_events) %>%
  pull(idVisit)

# Discard every row that belongs to one of those event-only visits
data_cleaned <- filter(data_cleaned, !idVisit %in% visits_with_only_events)
# Sort all rows by timestamp, then compute for every row how far its timestamp
# lies past the earliest timestamp within the same (idVisit, idpageview) pair.
# The difference is expressed in whatever unit as.numeric(timestamp) yields.
data_cleaned <- data_cleaned %>%
  arrange(timestamp) %>%
  group_by(idVisit, idpageview) %>%
  mutate(
    time_difference = as.numeric(timestamp) - min(as.numeric(timestamp))
  ) %>%
  ungroup()
# data_cleaned <- data_cleaned %>% mutate(pixel_depth = eventValue * heightInPixels / 100)
# Build the action-only table with numeric screen dimensions:
#   1. keep the rows of type "action";
#   2. take the leading digits of `resolution` as width and the trailing
#      digits as height;
#   3. drop rows where either dimension is missing or not positive.
action_data <- data_cleaned %>%
  filter(type == "action") %>%
  mutate(
    width = as.numeric(str_extract(resolution, "^[0-9]+")),
    height = as.numeric(str_extract(resolution, "[0-9]+$"))
  ) %>%
  filter(
    !is.na(width), width > 0,
    !is.na(height), height > 0
  )
# Mean and median of width and height per device type. The `.names` pattern
# yields the columns mean_width, median_width, mean_height, median_height.
summary <- action_data %>%
  group_by(deviceType) %>%
  summarise(
    across(
      c(width, height),
      list(mean = mean, median = median),
      .names = "{.fn}_{.col}"
    )
  )

# Desktop: median dimensions as a one-row table and as plain values
desktop_summary <- select(
  filter(summary, deviceType == "Desktop"),
  median_width, median_height
)
median_width_desktop <- desktop_summary[["median_width"]]
median_height_desktop <- desktop_summary[["median_height"]]

# Smartphone: median dimensions as a one-row table and as plain values
smartphone_summary <- select(
  filter(summary, deviceType == "Smartphone"),
  median_width, median_height
)
median_width_smartphone <- smartphone_summary[["median_width"]]
median_height_smartphone <- smartphone_summary[["median_height"]]

# Tablet: median dimensions as a one-row table and as plain values
tablet_summary <- select(
  filter(summary, deviceType == "Tablet"),
  median_width, median_height
)
median_width_tablet <- tablet_summary[["median_width"]]
median_height_tablet <- tablet_summary[["median_height"]]
# Height of the virtual browser, in pixels, used as the reference height
virtual_browser_height <- 800

# Derive four columns in one pass (later ones build on earlier ones):
#   estimated_browser_height - median screen height of the row's device type,
#       scaled by 0.95 (Desktop) or 0.90 (Smartphone, Tablet)
#       NOTE(review): the factors presumably discount browser chrome - confirm
#   page_length_ratio        - virtual browser height / estimated height
#   adjusted_page_height     - heightInPixels rescaled by that ratio
#   pixel_depth              - eventValue percent of the adjusted page height
data_cleaned <- data_cleaned %>%
  mutate(
    estimated_browser_height = case_when(
      deviceType == "Desktop" ~ 0.95 * median_height_desktop,
      deviceType == "Smartphone" ~ 0.90 * median_height_smartphone,
      deviceType == "Tablet" ~ 0.90 * median_height_tablet
    ),
    page_length_ratio = virtual_browser_height / estimated_browser_height,
    adjusted_page_height = heightInPixels * page_length_ratio,
    pixel_depth = eventValue * adjusted_page_height / 100
  )
# Write the cleaned data set to disk; the commented-out fread() below reloads
# it from the same path instead of repeating the cleaning steps.
fwrite(data_cleaned, "/Users/tom/Documents/10 Data/DissData/4/data_cleaned.csv")
#data_cleaned <- fread("/Users/tom/Documents/10 Data/DissData/4/data_cleaned.csv")
# Report the share of the original rows that is left at this point
# (spelling of the status message corrected: "cleaning").
n_data_cleaned <- nrow(data_cleaned)
print(paste(n_data_cleaned / n_data * 100, "% left after cleaning", sep = " "))
## [1] "97.4137352393833 % left after cleaning"
# List every distinct url value that occurs in the cleaned data
data_cleaned %>%
  distinct(url)
## # A tibble: 2,282 × 1
## url
## <chr>
## 1 755fc3c38f072fb7d16c968a87d89475
## 2 ffe1cf4d42063bdffedad446f83c550a
## 3 463603f785e6f9647e0111981881564c
## 4 c73945635bbe64182efd4af6e016769e
## 5 9048b848afe2ba9fd2dad1b644c636b2
## 6 2fc5fb3711191ed1b4af211482eefffc
## 7 17121ae452c2599320d02275e2ddc5d8
## 8 70d8b9378310128282cc7f17ecd8dc92
## 9 71e5cee12ef2ed6cb85c83f736bb71a2
## 10 b5c39d4580d482c603ab9a5acf34e01c
## # ℹ 2,272 more rows
# Histogram of the screen heights below 3000, one panel per device type
ggplot(filter(action_data, height < 3000), aes(x = height)) +
  geom_histogram() +
  facet_wrap(vars(deviceType))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Keep only the rows of type "action"
action_data <- data_cleaned %>% filter(type == "action")

# Number of action rows per (device type, resolution), most frequent first
# within each device type
resolution_count <- action_data %>%
  count(deviceType, resolution, name = "count") %>%
  arrange(deviceType, desc(count))

# The five most frequent resolutions per device type. with_ties = FALSE caps
# the result at exactly five rows per device type; by default slice_max()
# returns extra rows whenever several resolutions share the fifth-highest
# count. Among tied resolutions the ones listed first are kept.
top_resolutions <- resolution_count %>%
  group_by(deviceType) %>%
  slice_max(order_by = count, n = 5, with_ties = FALSE) %>%
  ungroup()
rm(resolution_count)
# Reshape to one row per resolution and one column per device type; a
# resolution that is not among a device type's top entries gets 0 there.
pivot_table <- pivot_wider(
  top_resolutions,
  names_from = deviceType,
  values_from = count,
  values_fill = 0
)
rm(top_resolutions)

# Show the resolution-by-device-type table
print(pivot_table)
## # A tibble: 15 × 4
## resolution Desktop Smartphone Tablet
## <chr> <int> <int> <int>
## 1 1920x1080 48773 0 0
## 2 1536x864 32100 0 0
## 3 1366x768 23646 0 0
## 4 1280x720 22143 0 0
## 5 1440x900 14241 0 0
## 6 390x844 0 13144 0
## 7 414x896 0 7972 0
## 8 360x800 0 7781 0
## 9 375x812 0 6720 0
## 10 375x667 0 4492 0
## 11 768x1024 0 0 529
## 12 800x1280 0 0 410
## 13 1280x800 0 0 340
## 14 1024x1366 0 0 260
## 15 810x1080 0 0 240