Loading Libraries & Data

library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten
library(viridis)
## Loading required package: viridisLite
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(digest)
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(RMySQL)
## Loading required package: DBI
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(nls.multstart)
library(xtable)
library(minpack.lm)
library(broom)
# Read the raw sample export and record how many rows it has before any
# cleaning is applied.
data <- fread("/Users/tom/data_sample.csv")
n_data <- dim(data)[1L]

Data Cleaning

# Number of "action" rows for every (idSite, deviceType) combination.
action_counts <- data %>%
  filter(type == "action") %>%
  group_by(idSite, deviceType) %>%
  tally(name = "num_actions") %>%
  ungroup()

# Wide layout: one row per device type, one column per site; combinations
# that never occur are filled with 0.
pivot_table <- pivot_wider(
  action_counts,
  names_from = idSite,
  values_from = num_actions,
  values_fill = 0
)

# Show the resulting table
print(pivot_table)
## # A tibble: 9 × 4
##   deviceType   `1`   `2`    `5`
##   <chr>      <int> <int>  <int>
## 1 Console        1     0      0
## 2 Desktop    20103 16288 170443
## 3 Phablet      453   302   1328
## 4 Smartphone 16165 28217  41890
## 5 Tablet       735  1749   1346
## 6 Tv             3     0      3
## 7 Unknown       45    38    161
## 8 Peripheral     0     1      2
## 9 Wearable       0     0      2

We will remove the obscure device types and count phablets as smartphones.

# Relabel phablets as smartphones, then keep only the three main device
# classes; rows with any other device type are dropped.
data_cleaned <- data %>%
  mutate(deviceType = if_else(deviceType == "Phablet", "Smartphone", deviceType)) %>%
  filter(deviceType %in% c("Desktop", "Tablet", "Smartphone"))

# Report the share of the original rows that survived the device filter.
n_data_cleaned <- nrow(data_cleaned)
print(paste(n_data_cleaned / n_data * 100, "% left after cleaning"))
## [1] "99.9443304297338 % left after cleaning"

# Force eventValue to 0 on "action" rows; all other rows keep their value.
data_cleaned <- data_cleaned %>%
  mutate(eventValue = if_else(type == "action", 0, eventValue))




# Ids of visits in which every row is of type "event".
visits_with_only_events <- data_cleaned %>%
  group_by(idVisit) %>%
  summarise(all_events = all(type == "event")) %>%
  filter(all_events) %>%
  pull(idVisit)

# Drop those event-only visits, order the rows by timestamp, and compute for
# each (idVisit, idpageview) pair the offset of every row from the pair's
# earliest timestamp.
data_cleaned <- data_cleaned %>%
  filter(!idVisit %in% visits_with_only_events) %>%
  arrange(timestamp) %>%
  group_by(idVisit, idpageview) %>%
  mutate(
    time_difference = as.numeric(timestamp) - min(as.numeric(timestamp))
  ) %>%
  ungroup()

# data_cleaned <- data_cleaned %>% mutate(pixel_depth = eventValue * heightInPixels / 100)

# "action" rows only, with width and height parsed from the leading and
# trailing digit runs of the resolution string. Rows whose parsed dimensions
# are missing or not strictly positive are discarded.
action_data <- data_cleaned %>%
  filter(type == "action") %>%
  mutate(
    width = as.numeric(str_extract(resolution, "^[0-9]+")),
    height = as.numeric(str_extract(resolution, "[0-9]+$"))
  ) %>%
  filter(!is.na(width) & !is.na(height) & width > 0 & height > 0)

# Mean and median of width and height per device type. The resulting columns
# are mean_width, median_width, mean_height, median_height.
summary <- action_data %>%
  group_by(deviceType) %>%
  summarise(
    across(
      c(width, height),
      list(mean = mean, median = median),
      .names = "{.fn}_{.col}"
    )
  )

# Desktop: median dimensions
desktop_summary <- select(
  filter(summary, deviceType == "Desktop"),
  median_width, median_height
)
median_width_desktop <- pull(desktop_summary, median_width)
median_height_desktop <- pull(desktop_summary, median_height)

# Smartphone: median dimensions
smartphone_summary <- select(
  filter(summary, deviceType == "Smartphone"),
  median_width, median_height
)
median_width_smartphone <- pull(smartphone_summary, median_width)
median_height_smartphone <- pull(smartphone_summary, median_height)

# Tablet: median dimensions
tablet_summary <- select(
  filter(summary, deviceType == "Tablet"),
  median_width, median_height
)
median_width_tablet <- pull(tablet_summary, median_width)
median_height_tablet <- pull(tablet_summary, median_height)


# Height of the virtual browser (800 pixels) used as the common reference.
virtual_browser_height <- 800

# In one pass:
#   1. estimate the browser height from the device type's median screen
#      height (factor 0.95 for desktops, 0.90 for smartphones and tablets),
#   2. take the ratio of the virtual browser height to that estimate,
#   3. scale heightInPixels by the ratio,
#   4. turn eventValue (divided by 100) into a depth on the scaled page.
data_cleaned <- data_cleaned %>%
  mutate(
    estimated_browser_height = case_when(
      deviceType == "Desktop" ~ median_height_desktop * 0.95,
      deviceType == "Smartphone" ~ median_height_smartphone * 0.90,
      deviceType == "Tablet" ~ median_height_tablet * 0.90
    ),
    page_length_ratio = virtual_browser_height / estimated_browser_height,
    adjusted_page_height = heightInPixels * page_length_ratio,
    pixel_depth = eventValue * adjusted_page_height / 100
  )


# Write the cleaned data to disk; the commented-out fread() below reloads it.
fwrite(data_cleaned, "/Users/tom/Documents/10 Data/DissData/4/data_cleaned.csv")
#data_cleaned <- fread("/Users/tom/Documents/10 Data/DissData/4/data_cleaned.csv")

# Report the share of the original rows that remains after all cleaning steps.
n_data_cleaned <- nrow(data_cleaned)
print(paste(n_data_cleaned / n_data * 100, "% left after cleaning"))
## [1] "97.4137352393833 % left after cleaning"

# Distinct url values present in the cleaned data.
data_cleaned %>%
  distinct(url)
## # A tibble: 2,282 × 1
##    url                             
##    <chr>                           
##  1 755fc3c38f072fb7d16c968a87d89475
##  2 ffe1cf4d42063bdffedad446f83c550a
##  3 463603f785e6f9647e0111981881564c
##  4 c73945635bbe64182efd4af6e016769e
##  5 9048b848afe2ba9fd2dad1b644c636b2
##  6 2fc5fb3711191ed1b4af211482eefffc
##  7 17121ae452c2599320d02275e2ddc5d8
##  8 70d8b9378310128282cc7f17ecd8dc92
##  9 71e5cee12ef2ed6cb85c83f736bb71a2
## 10 b5c39d4580d482c603ab9a5acf34e01c
## # ℹ 2,272 more rows

Some Analysis

# Distribution of parsed screen heights below 3000, one panel per device type.
# `bins = 30` spells out the value stat_bin() falls back to when nothing is
# given, so the plot is unchanged while the "Pick better value with
# `binwidth`" message is no longer emitted.
action_data %>%
  filter(height < 3000) %>%
  ggplot(aes(x = height)) +
  geom_histogram(bins = 30) +
  facet_wrap(vars(deviceType))

# Rebuild the "action" subset from the current data_cleaned.
action_data <- filter(data_cleaned, type == "action")

# Occurrences of each resolution within each device type, most frequent first.
resolution_count <- action_data %>%
  count(deviceType, resolution, name = "count") %>%
  arrange(deviceType, desc(count))

# Five most frequent resolutions per device type (slice_max keeps ties).
top_resolutions <- resolution_count %>%
  group_by(deviceType) %>%
  slice_max(order_by = count, n = 5) %>%
  ungroup()

# One row per resolution, one column per device type; missing pairs become 0.
pivot_table <- pivot_wider(
  top_resolutions,
  names_from = deviceType,
  values_from = count,
  values_fill = 0
)

# The intermediate tables are not needed past this point.
rm(resolution_count, top_resolutions)

# Show the resulting table
print(pivot_table)
## # A tibble: 15 × 4
##    resolution Desktop Smartphone Tablet
##    <chr>        <int>      <int>  <int>
##  1 1920x1080    48773          0      0
##  2 1536x864     32100          0      0
##  3 1366x768     23646          0      0
##  4 1280x720     22143          0      0
##  5 1440x900     14241          0      0
##  6 390x844          0      13144      0
##  7 414x896          0       7972      0
##  8 360x800          0       7781      0
##  9 375x812          0       6720      0
## 10 375x667          0       4492      0
## 11 768x1024         0          0    529
## 12 800x1280         0          0    410
## 13 1280x800         0          0    340
## 14 1024x1366        0          0    260
## 15 810x1080         0          0    240