En esta sección mostramos la descarga de guiones de películas, que vienen clasificados por género. La fuente es http://www.imsdb.com/.
library(RCurl)
library(gsubfn)
library(tictoc)
library(tidyverse)
En imsdb se encuentran los guiones clasificados por su género. Usando http://www.imsdb.com/feeds/genre.php?genre=GENERO
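con GENERO como uno de: Action, Adventure, Animation, Comedy, Crime, Drama, Family, Fantasy, Film-Noir, Horror, Musical, Mystery, Romance, Sci-Fi, Short, Thriller, War o Western,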
obtenemos el listado -por género de películas- de ligas para descargar los guiones.
# Names used for the entries of `url_list`, mapped to the genre identifier the
# imsdb feed expects in its `genre=` query parameter (the two differ in
# spelling for "Film-Noir" and "Sci-Fi").
genre_feeds <- c(
  action = "Action",
  adventure = "Adventure",
  animation = "Animation",
  comedy = "Comedy",
  crime = "Crime",
  drama = "Drama",
  family = "Family",
  fantasy = "Fantasy",
  film_noir = "Film-Noir",
  horror = "Horror",
  musical = "Musical",
  mystery = "Mystery",
  romance = "Romance",
  scifi = "Sci-Fi",
  short = "Short",
  thriller = "Thriller",
  war = "War",
  western = "Western"
)

# Download the feed of one genre and return the links it contains.
# The first <link> of every feed is discarded, exactly as the per-genre code
# did before (presumably it is the feed's own link, not a script - confirm).
fetch_genre_links <- function(genre) {
  u <- paste0("http://www.imsdb.com/feeds/genre.php?genre=", genre)
  links <- strapplyc(getURL(u), "<link>(.*?)</link>", simplify = c)
  links[-1]
}

# One request per genre. lapply() keeps the names of `genre_feeds`, so
# `url_list` has the same names and order as when it was filled by hand.
# The per-genre intermediates (`action`, `family`, `short`, ...) are no longer
# created in the global environment; the visible code only uses `url_list`.
url_list <- lapply(genre_feeds, fetch_genre_links)

# Number of script links found for each genre.
lapply(url_list, length)
## $action
## [1] 509
##
## $adventure
## [1] 385
##
## $animation
## [1] 257
##
## $comedy
## [1] 776
##
## $crime
## [1] 224
##
## $drama
## [1] 639
##
## $family
## [1] 51
##
## $fantasy
## [1] 128
##
## $film_noir
## [1] 4
##
## $horror
## [1] 152
##
## $musical
## [1] 26
##
## $mystery
## [1] 131
##
## $romance
## [1] 206
##
## $scifi
## [1] 354
##
## $short
## [1] 3
##
## $thriller
## [1] 411
##
## $war
## [1] 31
##
## $western
## [1] 17
scripts_all.RDS
# Worked example: fetch one script page and keep a local copy of its raw HTML
# so the markup can be inspected.
url <- "http://www.imsdb.com/scripts/Nightmare-Before-Christmas,-The.html"
doc.raw <- url %>% getURL()
doc.raw %>% write_lines("./datos/Nightmare-Before-Christmas,-The.html")
# Print 20 lines of the saved HTML, starting at line 300, to show the markup of a script page.
tail -n +300 ./datos/Nightmare-Before-Christmas,-The.html | head -n 20
## </b>I am the "who" when you call, "Who's there?"
## I am the wind blowing through your hair
##
## <b>OOGIE BOOGIE SHADOW
## </b>I am the shadow on the moon at night
## Filling your dreams to the brim with fright
##
## <b>CORPSE CHORUS
## </b>This is Halloween, this is Halloween
## Halloween! Halloween! Halloween! Halloween!
## Halloween! Halloween!
##
## <b>CHILD CORPSE TRIO
## </b>Tender lumplings everywhere
## Life's no fun without a good scare
##
## <b>PARENT CORPSES
## </b>That's our job, but we're not mean
## In our town of Halloween
##
## tail: stdout: Broken pipe
La función para hacer esto es:
#' Extract the text pieces of a script page
#'
#' Downloads the page at `url` and returns every piece of text found between a
#' closing `</b>` tag and the next opening `<b>` tag.
#'
#' @param url URL of the script page to download.
#' @return A character vector with one entry per extracted piece. Every run of
#'   whitespace (line breaks included) is collapsed to a single space and
#'   leading/trailing whitespace is removed.
script_vec <- function(url) {
  doc.raw <- getURL(url)
  doc_vec <- strapplyc(doc.raw, "</b>(.*?)<b>", simplify = c)
  # Line breaks must become spaces rather than vanish: deleting "\n" outright
  # glued the last word of a line to the first word of the next one
  # (e.g. "No![Dr. Finkelstein's castle]"). "\\s+" already matches "\n".
  doc_vec <- gsub("\\s+", " ", doc_vec)
  # A line break next to a tag would otherwise leave a stray edge space.
  trimws(doc_vec)
}
#tic()
# Build the full corpus only when the cached file does not exist yet: for each
# genre, parse every script URL with script_vec(), store all the resulting
# text pieces under the genre's name, and save the whole list as an RDS file.
if (!file.exists("./datos/scripts_all.RDS")) {
  scripts <- list()
  for (genre in names(url_list)) {
    script_list <- lapply(url_list[[genre]], script_vec)
    # Flatten in a single pass; Reduce("c", ...) re-copied the growing vector
    # once per script. The resulting vector is the same.
    scripts[[genre]] <- unlist(script_list)
  }
  write_rds(scripts, "./datos/scripts_all.RDS")
}
#toc()
scripts_all.RDS
que contiene una lista con vectores de caracteres. Cada entrada de estos vectores es una línea de diálogo de una película o la descripción de una escena. Por ejemplo:
script_vec(url)[300:310]
## [1] "Where?"
## [2] "To Oogie boogie, of course. There isn't anywhere in the whole world more comfortable than that and Jack said to make him comfortable. Didn't he?"
## [3] "Yes he did."
## [4] "Haven't you heard of peace on earth and good will toward men?"
## [5] "No![Dr. Finkelstein's castle][getting fog juice] "
## [6] "This'll stop Jack.[working on new creation to replace Sally]"
## [7] "What a joy to think of all we'll have in common. We'll have conversations worth having.[Oogie's]"
## [8] "[laughing]"
## [9] "Don't do this. Naughty children never get any presents."
## [10] "I think he might be too big."
## [11] "No he's not. If he can go down a chimney, he can fit down here![in Oogie's lair] Oogie Boogie's Song Performed by Ken Page with Ed lvory"
scripts_less.RDS
# Script links that appear in both the musical and the animation lists.
intersect(url_list[["musical"]], url_list[["animation"]])
## [1] "http://www.imsdb.com/scripts/Aladdin.html"
## [2] "http://www.imsdb.com/scripts/Anastasia.html"
## [3] "http://www.imsdb.com/scripts/Lion-King,-The.html"
## [4] "http://www.imsdb.com/scripts/Little-Mermaid,-The.html"
## [5] "http://www.imsdb.com/scripts/Mary-Poppins.html"
## [6] "http://www.imsdb.com/scripts/Mulan.html"
## [7] "http://www.imsdb.com/scripts/Nightmare-Before-Christmas,-The.html"
## [8] "http://www.imsdb.com/scripts/South-Park.html"
# Collapse the genre list into fewer categories.

# Discard these two genres entirely.
url_list$film_noir <- NULL
url_list$short <- NULL

# Family scripts are folded into animation below, so remove them from comedy
# first to avoid listing them under both genres.
url_list$comedy <- url_list$comedy[!url_list$comedy %in% url_list$family]

# Fold family and musical into animation. unique() is required because the
# lists overlap (several musicals are already listed as animation); without it
# the shared scripts would be downloaded and counted twice.
url_list$animation <- unique(
  c(url_list$animation, url_list$family, url_list$musical)
)
url_list$family <- NULL
url_list$musical <- NULL

# Fold western into action, again keeping each link only once.
url_list$action <- unique(c(url_list$action, url_list$western))
url_list$western <- NULL

# Disabled check: war links not already listed under adventure, action or drama.
#Filter(function(x) !x %in% c(url_list$adventure, url_list$action, url_list$drama), url_list$war)
url_list$war <- NULL

# Disabled check: thriller links not already listed under the other remaining genres.
#Filter(function(x) !x %in% c(url_list$adventure, url_list$action, url_list$drama, url_list$crime, url_list$horror, url_list$mystery, url_list$scifi), url_list$thriller)
url_list$thriller <- NULL
scripts_less.RDS
# tic()
#
# if (!file.exists("./datos/scripts_less.RDS")) {
# scripts <- list()
# for(genre in names(url_list)) {
# script_list <- lapply(url_list[[genre]], script_vec)
# scripts[[genre]] <- Reduce("c", script_list)
# }
# write_rds(scripts, "./datos/scripts_less.RDS")
# }
#
# toc()