---
title: "BUA - HW 5 - Part 1"
format: dashboard
---

```{r setup, warning=F, message=F, echo=F, include=F}
# Chunk defaults: echo the source of every R chunk unless the chunk overrides it
knitr::opts_chunk$set(echo = TRUE)

# Print numbers in fixed notation rather than scientific notation
options(scipen = 100)

# pacman installs any missing packages and then attaches them.
# requireNamespace() only checks that pacman is installed (it does not attach
# it); p_load() below attaches it. The package is fetched over HTTPS.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman", repos = "https://cloud.r-project.org")
}

# Install (if needed) and attach the packages used by this dashboard.
# pacman is listed first so that it is attached before the others.
pacman::p_load(pacman, tidyverse, magrittr, tidyquant, ggthemes,
               RColorBrewer, highcharter, kableExtra)

# verify packages
# p_loaded()


```

```{r  data import, warning=F, message=F, echo=F, include=F}
# Read the Netflix titles file and do preliminary cleaning:
#   * keep only the columns used later in the dashboard
#   * split the comma-separated genre listing into up to three columns
#     (titles with fewer than three genres get NA in the unused columns)
#   * parse date_added as a date and derive the year the title was added
#   * drop rows without a usable year_added, and rows whose rating is
#     "NR"/"UR" or a run-time string that was entered in the rating column
nflx <- read_csv("data/netflix_titles.csv", show_col_types = FALSE) |>
  select(show_id, type, date_added, release_year, rating, listed_in) |>
  separate(listed_in, into = c("g1", "g2", "g3"), sep = ", ") |>
  mutate(
    date_added = mdy(date_added),
    year_added = year(date_added)
  ) |>
  filter(
    !is.na(year_added),
    !rating %in% c("NR", "UR", "66 min", "74 min", "84 min")
  )

```

```{r simplify ratings and release year data, warning=F, message=F, echo=F, include=F}
# Derive two simplified variables:
#   min_age        - numeric minimum viewing age implied by the rating
#                    (NA when the rating is not one of the listed codes)
#   release_period - character label for the release-year band
#                    (NA when release_year is missing)
# case_when() takes the first matching condition, so each release-year test
# only needs its upper bound; the bands are the same as testing both bounds.
nflx <- nflx |>
  mutate(
    min_age = case_when(
      rating %in% c("G", "TV-G", "TV-Y")                ~ 0,
      rating %in% c("PG", "TV-PG", "TV-Y7", "TV-Y7-FV") ~ 7,
      rating %in% c("PG-13", "TV-14")                   ~ 13,
      rating %in% c("NC-17", "TV-MA", "R")              ~ 17,
      TRUE                                              ~ NA_real_
    ),
    release_period = case_when(
      release_year <= 1980 ~ "1925-1980",
      release_year <= 2000 ~ "1981-2000",
      release_year <= 2005 ~ "2001-2005",
      release_year <= 2010 ~ "2006-2010",
      release_year <= 2015 ~ "2011-2015",
      release_year > 2015  ~ "2016-2021",
      TRUE                 ~ NA_character_
    )
  )

```

```{r genre data cleaning, warning=F, message=F, echo=F, include=F}
# Reshape to one row per title/genre and harmonise the genre labels.
# Note: this overwrites nflx with the long (one genre per row) version.
# Steps:
#   1. stack g1/g2/g3 into a single genre column
#   2. lower-case and trim, then strip the "movies" / "tv shows" / "tv"
#      qualifiers (applied in this order, one after another)
#   3. collapse near-duplicate labels into one name per genre
#   4. keep only the six genres used in the dashboard
nflx <- nflx |>
  select(show_id, type, year_added, release_period, min_age, g1, g2, g3) |>
  pivot_longer(cols = c("g1", "g2", "g3"), names_to = "g", values_to = "genre") |>
  select(-g) |>
  mutate(
    genre = str_trim(tolower(genre)),

    # remove each qualifier in turn; order matters (" tv shows" before " tv")
    genre = reduce(
      c(" movies", "movies", " tv shows", "tv shows", "tv ", " tv"),
      \(txt, qualifier) gsub(qualifier, "", txt),
      .init = genre
    ),

    # map variant labels onto a single name; anything else is left unchanged
    genre = case_when(
      genre == "drama" ~ "dramas",
      genre %in% c("stand-up comedy",
                   "stand-up comedy & talk shows") ~ "comedies",
      genre %in% c("documentaries", "docuseries") ~ "docs",
      genre %in% c("children & family", "kids'") ~ "kids",
      TRUE ~ genre
    ),

    genre = gsub("action & adventure", "action_adventr", genre)
  ) |>
  filter(genre %in% c("action_adventr", "comedies", "docs",
                      "dramas", "international", "kids"))

# table(nflx$type, nflx$genre)

```

```{r summarizing dataset by relevant variables, warning=F, message=F, echo=F, include=F}
# Count titles for every type / release period / year added / minimum age /
# genre combination, then spread the genres into one count column each.
#
# values_fill = 0L puts a zero only in the genre COUNT columns for
# combinations that did not occur. The previous blanket
# `nflx_wide[is.na(nflx_wide)] <- 0` also overwrote NA in min_age, which
# silently re-labelled titles with a missing rating as "minimum age 0".
# Those titles now keep min_age = NA.
#
# count() returns an ungrouped tibble, so no grouping leaks into the
# datasets derived below.
nflx_wide <- nflx |>
  count(type, release_period, year_added, min_age, genre, name = "num") |>
  pivot_wider(id_cols = c(type, release_period, year_added, min_age),
              names_from = genre, values_from = num,
              values_fill = 0L)

# example dashboard will be created using cleaned and managed TV data
nflx_tv <- nflx_wide |>
  filter(type == "TV Show")

# for hw 5 you will create a dataset of just movies:
nflx_mv <- nflx_wide |>
  filter(type == "Movie")

```

# Netflix and AMC Stock Values

```{r import stock data, warning=F, message=F, echo=F, include=F}

# Most recent quote for each stock (used by the value boxes):
# download the past seven days and keep only the row with the latest date.
getSymbols("NFLX", from = Sys.Date() - 7, to = Sys.Date())
NFLX_mr <- as_tibble(fortify.zoo(NFLX), .name_repair = "minimal") |>
  rename(date = Index) |>
  filter(date == max(date))

getSymbols("AMC", from = Sys.Date() - 7, to = Sys.Date())
AMC_mr <- as_tibble(fortify.zoo(AMC), .name_repair = "minimal") |>
  rename(date = Index) |>
  filter(date == max(date))

# Historical series (used by the trend plots). These calls replace the
# one-week NFLX and AMC objects downloaded above, so they must run after
# NFLX_mr and AMC_mr have been created.
getSymbols("AMC", from = "2013-01-01", to = "2022-12-31")
getSymbols("NFLX", from = "2013-01-01", to = "2022-12-31")

```

## Row

The value boxes show the date of the most recent quote and the
latest adjusted closing prices of Netflix and AMC stock. The
interactive Highchart plots show the trend over time for the
range of data examined in the Netflix plots on the following
pages. Stock data were downloaded from [Yahoo
Finance](https://finance.yahoo.com/).

## Row {height="20%"}

```{r}
#| content: valuebox
#| title: "Last Update"
# Value box: date of the most recent NFLX quote, formatted like the template
# string passed to stamp()
list(
  color = "green",
  value = stamp("Sat. Jan. 1, 1999", quiet = TRUE)(NFLX_mr[["date"]])
)

```

```{r}
#| content: valuebox
#| title: "Netflix Adjusted Close"

# Value box: latest Netflix adjusted close, rounded to two decimals
list(
  color = "red",
  value = round(NFLX_mr[["NFLX.Adjusted"]], 2)
)
```

```{r}
#| content: valuebox
#| title: "AMC Adjusted Close"

# Value box: latest AMC adjusted close, rounded to two decimals
list(
  color = "blue",
  value = round(AMC_mr[["AMC.Adjusted"]], 2)
)

```

## Row {height="75%"}

```{r  pg1 nflx stock trends, warning=F, message=F, echo=F}

# Interactive Netflix chart: adjusted close plus daily high and low
NFLX_fts <- hchart(NFLX$NFLX.Adjusted, name = "NFLX Adj.", color = "red")
NFLX_fts <- hc_add_series(NFLX_fts, NFLX$NFLX.High,
                          name = "NFLX Hi.", color = "red4")
NFLX_fts <- hc_add_series(NFLX_fts, NFLX$NFLX.Low,
                          name = "NFLX Lo.", color = "lightcoral")

# display the chart
NFLX_fts

```

```{r pg1 amc stock trends, warning=F, message=F, echo=F}

# Interactive AMC chart: adjusted close plus daily high and low
AMC_fts <- hchart(AMC$AMC.Adjusted, name = "AMC Adj.", color = "blue")
AMC_fts <- hc_add_series(AMC_fts, AMC$AMC.High,
                         name = "AMC Hi.", color = "darkblue")
AMC_fts <- hc_add_series(AMC_fts, AMC$AMC.Low,
                         name = "AMC Lo.", color = "lightblue")

# display the chart
AMC_fts


```

## Row {height="5%"}

The trends in the two plots appear similar, but the y-axis
scales differ.

This plot is an alternative to the [High-Low Candlestick
plot](https://jkunst.com/highcharter/articles/stock.html)
which can also be created as a highchart.

[**This link is to
Quarto**](https://quarto.org/){target="_blank"}

# Bar Chart of Movie Trends

## Row

### Column {width="70%"}

```{r pg2 nflx mv release period data mgmt, include = F}

# Prepare the movie data for the faceted bar chart:
#   1. reshape the six genre count columns to long format (genre, n)
#   2. merge the 2001-2005 and 2006-2010 release periods into 2001-2010
#   3. keep the four release periods shown in the plot (1925-1980 is dropped)
#   4. add factor versions of min_age and genre; the genre factor fixes the
#      bar order and supplies the short axis labels
nflx_mv_plot1 <- nflx_mv |>
  pivot_longer(
    cols = c("comedies", "action_adventr", "docs",
             "dramas", "international", "kids"),
    names_to = "genre",
    values_to = "n"
  ) |>
  mutate(
    release_period = replace(
      release_period,
      release_period %in% c("2001-2005", "2006-2010"),
      "2001-2010"
    )
  ) |>
  filter(
    release_period %in% c("1981-2000", "2001-2010", "2011-2015", "2016-2021")
  ) |>
  mutate(
    min_ageF = factor(min_age, levels = c(0, 7, 13, 17)),
    genreF = factor(
      genre,
      levels = c("international", "dramas", "comedies",
                 "docs", "kids", "action_adventr"),
      labels = c("Int", "Dr", "C", "Do", "K", "A/A")
    )
  )

```

```{r pg2 nflx mv release period bar chart, fig.dim = c(10, 5), echo=F}

# Stacked bar chart of movie counts by genre, one panel per release period.
# Each bar is split by minimum viewing age, so the segments show how many
# movies of that genre fall in each age category.
nflx_mv_barplot <- nflx_mv_plot1 |>
  ggplot() +

  # geom_col() plots the pre-computed counts in n directly
  # (equivalent to geom_bar(stat = "identity"))
  geom_col(aes(x = genreF, y = n, fill = min_ageF), position = "stack") +

  theme_classic() +

  # facet_grid creates a separate panel for each release period
  facet_grid(~release_period) +

  # adjust colors all at once by changing palette
  scale_fill_brewer(palette = "Spectral") +

  # labels axes, titles, caption, and legend; the subtitle key uses the same
  # abbreviations as the genreF axis labels (Int, Dr, C, Do, K, A/A)
  labs(x = "Genre", y = "Number of Movies", fill = "Min. Age",
       title = "Release Time Periods of Netflix Movies by Genre",
       subtitle = "Int=International   Dr=Drama   C=Comedy   Do=Documentaries   K=Kids   A/A=Action/Adventure",
       caption = "Data Source: https://www.kaggle.com/shivamb/netflix-shows")

# display the plot
nflx_mv_barplot

```

### Column {width="30%"}

#### Row

Number of Netflix Movies from each Genre and Release Period.

#### Row

```{r pg2 nflx summary table, echo=F, message=F}

# Total number of movies per release period and genre, laid out with one
# row per release period and one column per genre for the presentation table
nflx_smry1 <- nflx_mv_plot1 |>
  group_by(release_period, genreF) |>
  summarize(n = sum(n), .groups = "drop_last") |>
  pivot_wider(id_cols = release_period,
              names_from = genreF,
              values_from = n) |>
  rename(`Release Time Period` = release_period)

# print out presentation table
kable(nflx_smry1)
```

#### Row

Genres are shown in the plot and table in order of prevalence.

Information on [grouped and stacked bar
charts](https://www.r-graph-gallery.com/stacked-barplot.html)

Information on plots with
[facets](https://ggplot2.tidyverse.org/reference/facet_grid.html)

Bonus link for [multi-panel
plots](http://www.sthda.com/english/wiki/ggplot2-facet-split-a-plot-into-a-matrix-of-panels)

A link to more information about [Quarto
Dashboards](https://quarto.org/docs/gallery/#dashboards){target="_blank"}

# Netflix Movies Added Each Year

## Row

### Column {width="75%"}

```{r pg3 nflx mv area plot data mgmt, include=F}

# Prepare the movie data for the area plot: total number of movies added
# per year (2013 onward) within each minimum-age category.
#
# Fixes relative to the earlier version:
#   * the genre columns are named explicitly; the range `comedies:kids`
#     depended on the column order produced by pivot_wider() and could
#     leave genres out of the total
#   * `na.rm = TRUE` is spelled correctly; `na_rm = T` was passed to sum()
#     as an extra value and added 1 to every total
nflx_mv_plot2 <- nflx_mv |>                           # start with nflx_mv (movies)
  ungroup() |>

  # row total across all six genre count columns
  mutate(total = rowSums(across(all_of(c("action_adventr", "comedies", "docs",
                                         "dramas", "international", "kids"))),
                         na.rm = TRUE)) |>

  select(year_added, min_age, total) |>               # keep only the 3 columns needed for plot
  filter(year_added >= 2013) |>

  group_by(year_added, min_age) |>                    # summarize by year and min_age category
  summarize(total = sum(total, na.rm = TRUE), .groups = "drop") |>
  mutate(min_ageF = factor(min_age, levels = c(0, 7, 13, 17)))

```

```{r pg3 nflx mv area plot, fig.dim = c(10, 6), echo=F}

# Stacked area plot: number of movies added each year, with one filled band
# per minimum-age category
nflx_mv_area_plot <- ggplot(data = nflx_mv_plot2) +
  geom_area(aes(x = year_added, y = total, fill = min_ageF)) +

  # one axis break per year, 2013 through 2021
  scale_x_continuous(breaks = seq(2013, 2021, 1)) +

  # color palette for the age categories
  scale_fill_brewer(palette = "Spectral") +

  # axis, legend, title, and caption text
  labs(x = "Year", y = "Number of Movies", fill = "Min. Age",
       title = "Number of Netflix Movies Added Each Year",
       subtitle = "2013 - 2021",
       caption = "Data Source: https://www.kaggle.com/shivamb/netflix-shows") +

  # classic theme first, then move the legend below the plot
  theme_classic() +
  theme(legend.position = "bottom")

# display the plot
nflx_mv_area_plot

```

### Column {width="25%"}

#### Row

This plot does not include genre information (shown on Page
2).

Further analyses would benefit from having data that
differentiates between Netflix original content and Netflix
content purchased from other sources.

#### Row

Information on [Area
plots](https://www.r-graph-gallery.com/136-stacked-area-chart)

Information on [R Color
Options](https://www.r-graph-gallery.com/38-rcolorbrewers-palettes.html)

# About

This dashboard was created using
[Quarto](https://quarto.org/) in
[RStudio](https://posit.co/), and the [R Language and
Environment](https://cran.r-project.org/).

The data used to create this dashboard were downloaded
from [Yahoo Finance](https://finance.yahoo.com/) and
[Kaggle](https://www.kaggle.com/).

## Row

**Software Citations**

## Row

**Software Citations**

Arnold J (2024). *ggthemes: Extra Themes, Scales and Geoms for 'ggplot2'*. R package version 5.1.0, https://github.com/jrnold/ggthemes, <https://jrnold.github.io/ggthemes/>.

Bache S, Wickham H (2025). _magrittr: A Forward-Pipe Operator for R_. doi:10.32614/CRAN.package.magrittr <https://doi.org/10.32614/CRAN.package.magrittr>, R package version 2.0.4, <https://CRAN.R-project.org/package=magrittr>.

Dancho M, Vaughan D (2025). _tidyquant: Tidy Quantitative Financial Analysis_. doi:10.32614/CRAN.package.tidyquant <https://doi.org/10.32614/CRAN.package.tidyquant>, R package version 1.0.11, <https://CRAN.R-project.org/package=tidyquant>.

Kunst J (2022). *highcharter: A Wrapper for the 'Highcharts' Library*. R package version 0.9.4, <https://CRAN.R-project.org/package=highcharter>.

Neuwirth E (2022). *RColorBrewer: ColorBrewer Palettes*. R package version 1.1-3, <https://CRAN.R-project.org/package=RColorBrewer>.

Posit team (2026). _RStudio: Integrated Development Environment for R_ (Version 2026.1.1.403). Posit Software, PBC, Boston, MA. <http://www.posit.co/>.

Quarto Development Team. (2026). Quarto Publishing System (Version 1.8.27). <https://quarto.org>.

R Core Team (2025). _R: A Language and Environment for Statistical Computing_ (Version 4.5.2). R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>.

Rinker, T. W. & Kurkiewicz, D. (2017). pacman: Package Management for R. version 0.5.0. Buffalo, New York. <http://github.com/trinker/pacman>.

Wickham H, Averick M, Bryan J, Chang W, McGowan LD, François R, Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). “Welcome to the tidyverse.” *Journal of Open Source Software*, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.

Xie Y (2025). _knitr: A General-Purpose Package for Dynamic Report Generation in R_. R package version 1.50, <https://yihui.org/knitr/>.

Yihui Xie (2015) Dynamic Documents with R and knitr. 2nd edition. Chapman and Hall/CRC. ISBN 978-1498716963

Yihui Xie (2014) knitr: A Comprehensive Tool for Reproducible Research in R. In Victoria Stodden, Friedrich Leisch and Roger D. Peng, editors, Implementing Reproducible Computational Research. Chapman and Hall/CRC. ISBN 978-1466561595

Zhu H (2024). *kableExtra: Construct Complex Table with 'kable' and Pipe Syntax*. R package version 1.4.0, https://github.com/haozhu233/kableExtra, <http://haozhu233.github.io/kableExtra/>.
