# CREATE TESTING DATA FILE
#
# Reads the three en_US source files (blogs, news, twitter) in full, pools
# their lines into one character vector, draws a fixed-seed random sample of
# `nl` lines, and writes that sample to ../Data/Test_1K.txt. The elapsed
# wall-clock time is displayed at the end.
#
# Paths are relative, so the result depends on the working directory the
# script is run from.

# Sample size:
nl <- 1000  # Number of lines to extract

# Libraries:
# NOTE(review): nothing in this script section calls a data.table function;
# the library call is kept in case code elsewhere relies on it.
library(data.table)

# Open connections to source data files (binary read mode):
conBlogs <- file("../Data/en_US/en_US.blogs.txt", open = "rb")
conNews <- file("../Data/en_US/en_US.news.txt", open = "rb")
conTwitter <- file("../Data/en_US/en_US.twitter.txt", open = "rb")

# Start timer (started after the connections are opened, so only reading,
# sampling and writing are timed):
t0 <- Sys.time()

# Read entire data files. skipNul = TRUE makes readLines skip embedded nul
# bytes. TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned. Timings and line counts in the trailing comments are the
# original author's recorded observations.
Blogs <- readLines(conBlogs, skipNul = TRUE)  # 18 sec
close(conBlogs)
length(Blogs)  # 899,288

News <- readLines(conNews, skipNul = TRUE)  # 17 sec
close(conNews)
length(News)  # 1,010,242

Twitter <- readLines(conTwitter, skipNul = TRUE)  # 23 sec
close(conTwitter)
length(Twitter)  # 2,360,148

# Create entire data set vector (blogs first, then news, then twitter):
dat <- c(Blogs, News, Twitter)
length(dat)

# Extract sample for testing data. The fixed seed makes the draw
# reproducible; sample() draws without replacement, so nl must not exceed
# length(dat).
set.seed(2)
sdat <- sample(dat, nl)

# Write into sample file, one sampled line per output line.
# NOTE: the file name says "1K"; rename it if nl is changed.
conSample <- file("../Data/Test_1K.txt")
writeLines(sdat, conSample)
close(conSample)

# Display elapsed time:
Sys.time() - t0