CSX_R homework 5

my site: https://b04902122.github.io/CSX_R/week_5/tf.html

libraries

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(janeaustenr)
library(tidytext)

## Warning: package 'tidytext' was built under R version 3.4.4

library(ggplot2)

Calculate total words in all novels

(“Sense and Sensibility”, “Pride and Prejudice”, “Mansfield Park”, “Emma”, “Northanger Abbey”, and “Persuasion”.)

book_words <- austen_books() %>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = TRUE) %>%
  ungroup()

total_words <- book_words %>% 
  group_by(book) %>% 
  summarize(total = sum(n))

book_words <- left_join(book_words, total_words)

## Joining, by = "book"

book_words

## # A tibble: 40,379 x 4
##    book              word      n  total
##    <fct>             <chr> <int>  <int>
##  1 Mansfield Park    the    6206 160460
##  2 Mansfield Park    to     5475 160460
##  3 Mansfield Park    and    5438 160460
##  4 Emma              to     5239 160996
##  5 Emma              the    5201 160996
##  6 Emma              and    4896 160996
##  7 Mansfield Park    of     4778 160460
##  8 Pride & Prejudice the    4331 122204
##  9 Emma              of     4291 160996
## 10 Pride & Prejudice to     4162 122204
## # ... with 40,369 more rows

Term Frequency Distribution for all 6 novels

ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 896 rows containing non-finite values (stat_bin).

Highest tf-idf words for all 6 novels

book_words <- book_words %>%
  bind_tf_idf(word, book, n)

book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(book) %>% 
  top_n(15) %>% 
  ungroup %>%
  ggplot(aes(word, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 2, scales = "free") +
  coord_flip()

## Selecting by tf_idf

CSX_R homework 5

Ymc

2018/04/10

libraries

Calculate total words in all novels

(“Sense and Sensibility”, “Pride and Prejudice”, “Mansfield Park”, “Emma”, “Northanger Abbey”, and “Persuasion”.)

Term Frequency Distribution for all 6 novels

Highest tf-idf words for all 6 novels