# 第3章のスクリプト (Script for Chapter 3)
#### Chapter 3: A brief introduction to R
# Note: The lines are not preceded by "> " here since
# (i) the contrast between what you enter and what R outputs does not have to be highlighted - below you will only find lines you will enter, and
# (ii) I would like you to be able to copy and paste directly without having to delete line-initial "> " all the time.
# Warm-up calls; meant to be entered one line at a time at the R prompt.
mean(c(1, 2, 3))
2 - 3
library(corpora)

### Section 3.1: A few central notions: data structures, functions, and arguments
2 + 2
3^2
(2 + 3)^2
sqrt(5)
log(150, 10)
log(x = 150, base = 10) # with argument labels
Log(150, 10) # R does not know this function - it only knows "log" (this line errors on purpose)
aa <- sqrt(5) # compute the square root of 5
ls()
aa
(aa <- sqrt(5)) # the outer parentheses make R print the assigned value
(aa <- aa + 2)
sqrt(9); sqrt(16)
rm(aa) # delete aa
rm(list = ls(all.names = TRUE)) # delete all data structures
x <- c(1:10)
# the next five calls show how arguments can be named, given by position, or left at their defaults
sample(x, size = 5, replace = TRUE, prob = NULL)
sample(x, 5, TRUE, NULL)
sample(x, 5, TRUE)
sample(x, 5, FALSE)
sample(x, 5)
x
sample(x)
sample(10)
q() # ends the R session
### Section 3.2: Vectors
## Section 3.2.1: Basics
# Creating, inspecting, joining and naming vectors.
sqrt(5)
aa <- sqrt(5) # compute the square root of 5
is.vector(aa)
class(aa)
length(aa)
(empty <- vector(length = 3)) # create an 'empty' vector of a user-defined length
(a.name <- "James")
class(a.name)
length(a.name)
(numbers <- c(1, 2, 3))
(names <- c("James", "Jonathan", "Jean-Luc"))
numbers1 <- c(1, 2, 3); numbers2 <- c(4, 5, 6) # generate two vectors
(numbers1.and.numbers2 <- c(numbers1, numbers2)) # join the two vectors
(numbers1.and.numbers2 <- append(numbers1, numbers2)) # another way to join the two vectors
numbers1 + numbers2 # that is, 1+4, 2+5, 3+6
bb <- 10
numbers1 * bb
bb <- c(10, 20)
numbers1 * bb # bb has 2 elements and numbers1 has 3, so bb is recycled
names(numbers) <- c("first", "second", "third"); numbers
(mixture <- c(1, 2, "Benjamin"))
str(numbers1)
str(mixture)
## Section 3.2.2: Loading vectors
# Reading vectors from files and from the console.
x <- scan(file = "C:/_qclwr/_inputfiles/dat_vector-a.txt", sep = "\n")
x.1 <- scan(file = "C:/_qclwr/_inputfiles/dat_vector-b.txt", what = "char")
x.2 <- scan(file = "C:/_qclwr/_inputfiles/dat_vector-b.txt", what = "char", sep = "\n")
x.1
x.2
# scan(nmax=1, what="char") waits for one item typed at the console; it is passed to dir()
filename <- select.list(dir(scan(nmax = 1, what = "char")), multiple = TRUE)
x.1 <- scan(file.choose(), what = "char")
x.2 <- scan(file.choose(), what = "char", sep = "\n")
# scan() without a file reads from the console: the three numbers below are its
# input, and the empty line after them ends the input
x <- scan()
1
2
3

x
## Section 3.2.3: Accessing and processing (parts of) vectors
# Subsetting by position and by logical test, set operations, tabulating, sorting.
min(c(1, 2, 3)); max(c(1, 2, 3))
x <- c("a", "b", "c", "d", "e")
x[3] # access the 3rd element of x
y <- 3
x[y] # access the 3rd element of x
z <- c(1, 3)
x[z] # access the 1st and the 3rd element of x just as x[c(1, 3)] would do
z <- c(1:3)
x[z] # access the elements 1 to 3 of x
x[-2] # access x but without its 2nd element
x == "d"
(x <- c(10:1)) # generate and output a vector with the numbers from 10 to 1
x == 4 # which elements of x are 4?
x <= 7 # which elements of x are smaller than or equal to 7?
x != 8 # which elements of x are not 8?
(x > 8 | x < 3) # which elements of x are larger than 8 or smaller than 3?
which(x == 4) # which elements of x are 4?
which(x <= 7) # which elements of x are less than or equal to 7?
which(x != 8) # which elements of x are not 8?
which(x > 8 | x < 3) # which elements of x are greater than 8 or less than 3?
(pointer <- which(x > 8 | x < 3))
(y <- x[pointer])
x[which(x > 8 | x < 3)] # output the elements of x which are greater than 8 or smaller than 3
x[x > 8 | x < 3] # output the elements of x which are greater than 8 or smaller than 3
length(which(x > 8 | x < 3)) # output the number of elements of x which are greater than 8 or smaller than 3
sum(x > 8 | x < 3) # output the number of elements of x which are greater than 8 or smaller than 3
x # output x
y <- which(x > 8) # store the positions of elements greater than 8 in the vector y
x[y] <- 12; x # replace the elements of x that are greater than 8 by 12
x <- c(10:1) # generate a vector with the numbers from 10 to 1 again
x[which(x > 8)] <- 12; x # change the element(s) in x which are greater than 8 to 12
x <- c(10:1) # generate a vector with the numbers from 10 to 1 again
x[x > 8] <- 12; x
x <- c(10:1); y <- c(2, 5, 9) # generate vectors again
x %in% y
y %in% x
x[x %in% y]
match(x, y)
match(y, x)
setdiff(x, y)
setdiff(y, x)
intersect(x, y)
intersect(y, x)
union(x, y)
union(y, x)
g <- c(1, 2, 3, 2, 3, 4, 3, 4, 5)
h <- c(2, 3, 1, 5, 2, 6, 3, 1, 2)
unique(g)
table(g)
table(g, h)
h
sort(h, decreasing = TRUE)
z <- c(3, 5, 10, 1, 6, 7, 8, 2, 4, 9)
order(z, decreasing = FALSE)
##############################################################
# You should now do "Exercise Box 3.1: Handling vectors" ... #
##############################################################
### Section 3.3: Factors
# Turn a character vector into a factor and check the result.
f <- c("open", "open", "open", "closed", "closed")
f
(f <- factor(f))
is.factor(f)
### Section 3.4: Data frames
## Section 3.4.1: Generating data frames in R
# Build a data frame from four vectors, once with default row numbers and once
# with the parts of speech as row names.
rm(list = ls(all.names = TRUE))
PartOfSpeech <- c("ADJ", "ADV", "N", "CONJ", "PREP")
TokenFrequency <- c(421, 337, 1411, 458, 455)
TypeFrequency <- c(271, 103, 735, 18, 37)
Class <- c("open", "open", "open", "closed", "closed")
x <- data.frame(PartOfSpeech, TokenFrequency, TypeFrequency, Class)
x
str(x)
x$PartOfSpeech
(x.2 <- data.frame(TokenFrequency, TypeFrequency, Class, row.names = PartOfSpeech))
str(x.2)
## Section 3.4.2: Loading and saving data frames in R
# Each choose.files() call opens a file dialog preset to the given default path.
rm(list = ls(all.names = TRUE))
x <- read.table(
  choose.files(default = "C:/_qclwr/_inputfiles/dat_dataframe-a.txt"),
  header = TRUE, sep = "\t", comment.char = ""
) # no row.names: R numbers rows
x.2 <- read.table(
  choose.files(default = "C:/_qclwr/_inputfiles/dat_dataframe-a.txt"),
  header = TRUE, row.names = 1, sep = "\t", comment.char = ""
) # with row.names
write.table(
  x, choose.files(default = "C:/_qclwr/_outputfiles/03-4-2_dataframe-a.txt"),
  quote = FALSE, sep = "\t", row.names = FALSE
)
# col.names=NA is used here because x.2 is written together with its row names
write.table(
  x.2, choose.files(default = "C:/_qclwr/_outputfiles/03-4-2_dataframe-b.txt"),
  quote = FALSE, sep = "\t", col.names = NA
)
## Section 3.4.3: Accessing and processing (parts of) data frames in R
# Column access, row/column indexing, subsetting and reordering of a data frame.
x <- read.table(
  choose.files(default = "C:/_qclwr/_inputfiles/dat_dataframe-a.txt"),
  header = TRUE, sep = "\t", comment.char = ""
)
str(x)
x$TokenFrequency
x$Class
attach(x) # makes the columns of x available under their bare names
Class
(TokenFrequency[4] <- 20)
x # compare with the assignment above
TokenFrequency[4] <- 458 # change the value back to the old one
x[2, 3] # the value of the second row and the third column
x[2, ] # all values of the second row (because no column is specified)
x[, 3] # all values of the third column (because no row is specified)
x[2:3, 4] # two values of the fourth column
x[c(1, 3), c(2, 4)] # the 1st and 3rd row of the 2nd and 4th column
which(x[, 2] > 450)
x[, 3][which(x[, 3] > 100)]
x[, 3][x[, 3] > 100]
TypeFrequency[TypeFrequency > 100]
(y <- x[which(Class == "open"), ]) # or shorter: (y<-x[Class=="open",])
(y <- x[which(x[, 4] == "open"), ]) # or shorter: (y<-x[x[,4]=="open",])
(y <- subset(x, Class == "open"))
(y <- subset(x, Class == "open" & TokenFrequency < 1000))
(y <- subset(x, PartOfSpeech %in% c("ADJ", "ADV")))
(ordering.index <- order(Class, -TokenFrequency))
x[ordering.index, ]
x[order(Class, -TokenFrequency), ]
no.of.rows <- dim(x)[1] # for the columns: no.of.columns<-dim(x)[2]
ordering.index <- sample(no.of.rows); ordering.index # a random order of the row numbers
x[ordering.index, ]
x[sample(dim(x)[1]), ]
# rank() is applied to the character columns here so that they can be negated for a descending sort
ordering.index <- order(-rank(Class), -rank(PartOfSpeech))
x[ordering.index, ]
##################################################################
# You should now do "Exercise Box 3.2: Handling data frames" ... #
##################################################################
### Section 3.5: Lists
# Build a list from a vector, a data frame and another vector, then access its parts.
rm(list = ls(all.names = TRUE))
a.vector <- c(1:10) # generates a vector with the numbers from one to ten
a.dataframe <- read.table(
  choose.files(default = "C:/_qclwr/_inputfiles/dat_dataframe-a.txt"),
  header = TRUE, sep = "\t", comment.char = ""
) # load the data frame from section 3.4
another.vector <- c("This", "may", "be", "a", "sentence", "from", "a", "corpus", "file", ".")
(a.list <- list(a.vector, a.dataframe, another.vector))
str(a.list)
a.list <- list(Part1 = a.vector, Part2 = a.dataframe, Part3 = another.vector)
names(a.list) <- c("Part1", "Part2", "Part3")
a.list <- list(a.vector, a.dataframe, another.vector) # redefine a.list
is.list(a.list[[1]])
is.vector(a.list[[1]])
is.data.frame(a.list[[2]])
is.vector(a.list[[3]])
a.list[1] # single brackets return a list ...
is.list(a.list[1])
a.list[[1]] # ... double brackets return the element itself
a.list[1]
# NOTE(review): a.list was just redefined without names, so $Part1 only works
# if the names are assigned again first - confirm the intended order
a.list$Part1 # or a.list[["Part1"]]
a.list[c(1, 3)]
a.list[[c(1, 3)]] # take the third part of the first element of the list
a.list[[1]][3] # take the third part of the first element of the list
a.list[[1]][3:5] # consecutive parts
a.list[[1]][c(3, 5)] # note that "a.list[[1]][3, 5]" does not work
a.list[[2]][3, c(2, 4)]
x <- a.list[[2]]
# NOTE(review): Class and PartOfSpeech are not defined in this section;
# presumably they are found via the attach(x) of Section 3.4.3 - confirm
(y <- split(x, Class))
y$open
split(x, list(Class, PartOfSpeech))
### Section 3.6: Elementary programming functions
## Section 3.6.1: Conditional expressions
# A two-way if/else test on a number.
a <- 2 # you can of course insert another number here
if (a > 2) {
  cat("a is greater than 2.\n")
} else {
  cat("a is not greater than 2.\n")
}
# this is an alternative: note the use of else if, which allows you to get rid of one additional level of embedding
b <- "car" # you can of course insert another word here
if (b == "automobile") {
  cat("b is 'automobile'!\n")
} else if (b == "car") { # !
  cat("b is 'car'!\n")
} else {
  cat("b is neither 'automobile' nor 'car'.\n")
}
#####################################################################
# You should now do "Exercise Box 3.3: Conditional expressions" ... #
#####################################################################
## Section 3.6.2: Loops
# for, nested for, next, break, repeat and while; every loop prints with cat().
for (i in 1:3) {
  cat(i, "\n")
}
j <- seq(2, 1, -0.5)
for (i in j) {
  cat(i, "\n")
}
# nested loops: the inner loop reuses (and overwrites) the name j
for (i in 1:2) {
  for (j in 6:7) {
    cat(i, "times", j, "is", i * j, "\n")
  }
}
# next skips the rest of the current iteration
for (i in 1:3) {
  if (i == 2) {
    next
  }
  cat(i, "\n")
}
# break leaves the loop altogether
for (i in 1:5) {
  if (i == 3) {
    break
  }
  cat(i, "\n")
}
# repeat has no condition of its own and runs until break
i <- 1
repeat {
  cat(i, "\n")
  i <- i + 1 # this way of incrementing a variable is extremely useful!
  if (i == 3) {
    break
  }
}
i <- 1
while (i < 3) {
  cat(i, "\n")
  i <- i + 1
}
###################################################
# You should now do "Exercise Box 3.4: Loops" ... #
###################################################
## Section 3.6.3: Rules of programming
# Sum the token frequencies per word class three ways: with an explicit loop,
# with logical subsetting, and with tapply().
PartOfSpeech <- c("ADJ", "ADV", "N", "CONJ", "PREP")
TokenFrequency <- c(421, 337, 1411, 458, 455)
TypeFrequency <- c(271, 103, 735, 18, 37)
Class <- c("open", "open", "open", "closed", "closed")
sum.closed <- 0; sum.open <- 0 # define two vectors for the results
for (i in seq_along(Class)) {
  current.class <- Class[i] # access each word class
  if (current.class == "closed") {
    sum.closed <- sum.closed + TokenFrequency[i] # if the current class is "closed", add its token frequency to the first result vector
  } else {
    sum.open <- sum.open + TokenFrequency[i] # if the current class is not "closed", add its token frequency to the other result vector
  } # end of if: test current class
} # end of for: access each word class
sum.closed; sum.open # look at the output
sum(TokenFrequency[which(Class == "closed")])
sum(TokenFrequency[which(Class == "open")])
tapply(TokenFrequency, Class, sum)
####################################################
# You should now do "Exercise Box 3.5: tapply" ... #
####################################################
# Compare explicit loops over a list with the equivalent sapply() calls.
another.list <- list(c(1), c(2, 3), c(4, 5, 6), c(7, 8, 9, 10))
lengths <- vector()
for (i in seq_along(another.list)) {
  lengths[i] <- length(another.list[[i]])
}
lengths
sapply(another.list, length)
first.elements <- vector()
for (i in seq_along(another.list)) {
  first.elements[i] <- another.list[[i]][1]
}
first.elements
sapply(another.list, "[", 1) # "[" is the subsetting function; 1 is passed on to it as the index
sapply(a.list, "[", 1) # use the list we generated in Section 3.5
a <- c(1, 5, 3); b <- c(2, 6, 4); (ab <- list(a, b))
lapply(ab, sort, decreasing = FALSE) # decreasing=F is passed on to sort
### Section 3.7: Character/string processing
## Section 3.7.1: Getting information from and accessing (vectors of) character strings
example <- c("I", "do", "not", "know")
nchar(example)
substr("internationalization", 6, 13)
substr(example, 2, 3)
some.first.vector <- c("abcd", "efgh")
some.other.vector <- c("ijkl", "mnop")
# one start and one stop position per string
substr(c(some.first.vector, some.other.vector), c(1, 2, 3, 4), c(2, 3, 4, 4))
## Section 3.7.2: Elementary ways to change (vectors of) character strings
tolower(example)
toupper(example)
chartr("o", "x", example)
## Section 3.7.3: Merging and splitting (vectors of) character strings without regular expressions
paste("I", "do", "not", "know", sep = " ")
paste("I", "do", "not", "know", collapse = " ") # same result
paste("I", "do", "not", "know", sep = " ", collapse = " ") # same result
paste(example, sep = " ")
# with a longer vector, there is a difference
paste(example, sep = " ", collapse = " ") # but sep=" " is the default of paste and can be omitted
paste(example, collapse = " ")
example.2 <- "I do not know"
strsplit(example.2, " ")
strsplit(example.2, "") # an empty split string splits into single characters
example.3 <- c("This is the first character string", "This is the second character string")
strsplit(example.3, " ")
unlist(strsplit(example.3, " "))
## Section 3.7.4: Searching and replacing without regular expressions
# grep() reports which elements match; regexpr()/gregexpr() report where in each
# element the first match / all matches start.
text <- c("This is a first example sentence.", "And this is a second example sentence.")
grep("second", text)
text[grep("second", text)]
grep("second", text, value = TRUE)
grep("eco", text, value = TRUE)
grep("is", text)
regexpr("second", text)
attributes(regexpr("second", text)) # returns a list!
attr(regexpr("second", text), "match.length") # returns a vector!
regexpr("e", text)
gregexpr("e", text)
gregexpr("e", text)[[1]] # and of course the same with [[2]]
unlist(gregexpr("e", text)[1]) # and of course the same with [2]
sapply(gregexpr("e", text), c)
unlist(gregexpr("e", text))
attributes(gregexpr("e", text))
attributes(gregexpr("e", text)[1])
gregexpr("e", text)[[1]]
attributes(gregexpr("e", text)[[1]]) # returns a list, same with [[2]]
unlist(attributes(gregexpr("e", text)[[1]]))
attr(gregexpr("e", text)[[1]], "match.length") # returns a vector, same with [[2]]
sapply(gregexpr("e", text), attributes)
sapply(gregexpr("e", text), attr, "match.length")
unlist(sapply(gregexpr("e", text), attr, "match.length"))
# strapply() comes from the gsubfn package loaded here.
library(gsubfn)
text <- c("This is a first example sentence.", "And this is a second example sentence.")
strapply(text, "first")
strapply(text, "is")
gsub("a", "the", text)
gsub("a", "the", text)
gsub(" a ", " the ", text) # the surrounding spaces restrict the match to " a "
text # gsub() returns a changed copy; text itself has not been reassigned
(text.2 <- gsub(" a ", " the ", text))
## Section 3.7.5: Searching and replacing with regular expressions
# Anchors, wildcards, quantifiers, alternation and character classes.
# `text` is the two-sentence vector assigned in Section 3.7.4.
grep("^t", text, ignore.case = TRUE, perl = TRUE, value = TRUE)
grep("s.c", text, ignore.case = TRUE, perl = TRUE, value = TRUE)
grep("f...t", text, ignore.case = TRUE, perl = TRUE, value = TRUE)
gsub("\\.", "!", text, perl = TRUE)
colors <- c("color", "colour")
grep("colou?r", colors, perl = TRUE, value = TRUE)
# NOTE(review): " +" only has a visible effect on runs of several spaces; the
# string below has single spaces only, so extra spaces may have been lost when
# this script was copied - confirm against the original
some.text.line <- "This is just one example."
gsub(" +", " ", some.text.line, perl = TRUE)
grep("colou{0,1}r", colors, perl = TRUE, value = TRUE)
some.vector <- c("a st b", "a stst b", "a st st b")
grep("(st){2,3}", some.vector, perl = TRUE, value = TRUE)
grep("(.t){2,3}", some.vector, perl = TRUE, value = TRUE)
gsub("a (first|second)", "another", text, ignore.case = TRUE, perl = TRUE)
gsub("(a|e|i|o|u)", "V", text, ignore.case = TRUE, perl = TRUE)
gsub("[aeiou]", "V", text, ignore.case = TRUE, perl = TRUE)
gsub("[a-ht-z]", "X", text, ignore.case = TRUE, perl = TRUE)
gsub("[1-5]", "3", "0123456789", perl = TRUE)
#######################################################################
# You should now do "Exercise Box 3.6: a few regular expressions" ... #
#######################################################################
# Negated classes, shorthand classes, word boundaries, greedy vs. lazy matching
# and back-references. `text` is the two-sentence vector assigned in Section 3.7.4.
gsub("[^1-5]", "3", "0123456789", perl = TRUE)
gsub("\\d", "3", "a1b2c3d4e5", perl = TRUE)
gsub("\\D", "3", "a1b2c3d4e5", perl = TRUE)
gsub("\\w+", "WRD", text, perl = TRUE)
gsub("\\w\\W", "<WB>", text, perl = TRUE)
gsub("\\W\\w", "<WB>", text, perl = TRUE)
gsub("\\b", "<WB>", text, perl = TRUE)
(the.match <- gregexpr("s.*s", text[1], perl = TRUE))
# cut the match out of the string: start position up to start + match length - 1
substr(text[1], unlist(the.match), unlist(the.match) + attr(the.match[[1]], "match.length") - 1)
some.corpus.line <- "he said, you are lazy and you are stupid."
gregexpr("you.*are", some.corpus.line, perl = TRUE)
gregexpr("(?U)you.*are", some.corpus.line, perl = TRUE)
gregexpr("you.*?are", some.corpus.line, perl = TRUE)
gregexpr("s.*?s", text[1], perl = TRUE)
(text.2 <- gsub("\\w+", "<w>", text, perl = TRUE))
(text.2 <- gsub("\\b", "<w>", text, perl = TRUE))
(text.2 <- gsub("(\\w+)", "\\1<w>", text, perl = TRUE))
(text.3 <- gsub("([!:,\\.\\?])", "\\1<p>", text.2, perl = TRUE))
# Swap the first two number groups of each date string.
American.dates <- c("5/15/1976", "2.15.1970", "1.9.2006")
# variant 1: the replacement always writes "/" after each of the two groups
(British.dates <- sub(
  "(\\d{1,2})\\D(\\d{1,2})\\D",
  "\\2/\\1/",
  American.dates,
  perl = TRUE
))
# variant 2: the separators are captured as well and written back
(British.dates <- sub(
  "(\\d{1,2})(\\D)(\\d{1,2})(\\D)",
  "\\3\\4\\1\\2",
  American.dates,
  perl = TRUE
))
# Back-references inside the search pattern: \\1 must repeat what group 1 matched.
# `text` is the two-sentence vector assigned earlier in the script.
gsub("(\\w+?)(\\W+\\w*?)\\1(\\W)", "\\1<r>\\2\\1<r>\\3", text, perl = TRUE)
gsub("(\\w+?)(\\W+\\w*?)\\1(\\W)", "\\1<r>\\2\\1<r>\\3", "This not is my dog", perl = TRUE)
gsub("(\\w+?)(\\W+\\w*?)\\1\\b", "\\1<r>\\2\\1<r>", text, perl = TRUE)

# Lookahead: (?=...) and (?!...) test what follows without consuming it.
example <- c("abcd", "abcde", "abcdf")
grep("abc(?=de)", example, perl = TRUE)
grep("abcde", example, perl = TRUE)
gsub("abcde", "xyz", example, perl = TRUE)
gsub("abc(?=de)", "xyz", example, perl = TRUE)
gsub("(.)(?=e)", "\\1\\1", example, perl = TRUE)
text
gregexpr("s.*?(?=s)", text[1], perl = TRUE)
gsub("d(?!e)", "D", example, perl = TRUE)
gsub("d[^e]", "D", example, perl = TRUE)
# Remove every tag that is not a word (<w ...>) or punctuation (<c ...>) tag.
example1 <- "<w UNC>er<c PUN>, <w AV0>anyway <w PNP>we<w VBB>'re <w AJ0>alright <w AV0>now <ptr target=KB0LC003><w AV0>so<c PUN>, <w PNP>you <w VVB>know <ptr target=KB0LC004></u>"
gsub("<[^wc][^ ].*?>", "", example1, perl = TRUE)
# example2 ends in a tag that starts with "w" but is not a "<w " tag
example2 <- "<w UNC>er<c PUN>, <w AV0>anyway <w PNP>we<w VBB>'re <w AJ0>alright <w AV0>now <ptr target=KB0LC003><w AV0>so<c PUN>, <w PNP>you <w VVB>know <wtr target=KB0LC004></u>"
gsub("<[^wc][^ ].*?>", "", example2, perl = TRUE)
# example3 ends in a tag with a space directly after its first letter
example3 <- "<w UNC>er<c PUN>, <w AV0>anyway <w PNP>we<w VBB>'re <w AJ0>alright <w AV0>now <ptr target=KB0LC003><w AV0>so<c PUN>, <w PNP>you <w VVB>know <p tr target=KB0LC004></u>"
gsub("<[^wc][^ ].*?>", "", example3, perl = TRUE)
# negative lookahead: match any tag unless it has the shape "<w XXX>", "<c XXX>" or "<w XXX-XXX>"
gsub("<(?![wc] ...(-...)?>).*?>", "", example1, perl = TRUE)
# same, but also remove whatever follows the tag up to the next "<"
gsub("<(?![wc] ...(-...)?>).*?>[^<]*", "", example1, perl = TRUE)
# strapply() examples: plain matches first, then calls that pass a function
# (nchar, c) and, in the last two, a backref setting.
text <- c(
  "This is a first example sentence.",
  "And this is a second example sentence."
)
strapply(text[1], "first")
strapply(text, "[a-z]*i[a-z]*", ignore.case = TRUE, perl = TRUE)
strapply(text, "[a-z]*i[a-z]*", nchar, ignore.case = TRUE, perl = TRUE)
strapply(text, "([a-z]*)i([a-z]*)", c, ignore.case = TRUE, perl = TRUE)
strapply(text, "([a-z]*)i([a-z]*)", c, ignore.case = TRUE, perl = TRUE, backref = -1)
strapply(text, "([a-z]*)i([a-z]*)", c, ignore.case = TRUE, perl = TRUE, backref = 1)
############################################################################
# You should now do "Exercise Box 3.7: a few more regular expressions" ... #
############################################################################
## Section 3.7.6: Merging and splitting (vectors of) character strings with regular expressions
# Split a tagged corpus line at its tags; the pattern is widened step by step
# as tagtext gets more varied tags.
tagtext <- "<w DPS>my <w NN1>mum <w CJC>and <w DPS>my <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"
unlist(strsplit(tagtext, "<w ...>", perl = TRUE))
tagtext <- "<w DPS>my <w NN1>mum <c PUN>, <w DPS>my <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"
unlist(strsplit(tagtext, "<[wc] ...>", perl = TRUE))
tagtext <- "<w DPS>my <w NN1>mum <c PUN>, <w DPS>my <w AJ0-VVN>beloved <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"
unlist(strsplit(tagtext, "<[wc] (...|...-...)>", perl = TRUE))
unlist(strsplit(tagtext, "<[wc] .{3,7}>", perl = TRUE))
tagtext <- "<w DPS>my <w NN1>service <w DPS>my <w AJ0-VVN>beloved <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"
unlist(strsplit(tagtext, "<[wc] .{3,7}>", perl = TRUE))
table(unlist(strsplit(tagtext, "<[wc] .{3,7}>", perl = TRUE)))
# " *" on both sides of the tag also removes the spaces next to it
unlist(strsplit(tagtext, " *<[wc] .{3,7}> *", perl = TRUE))
# Alternative to splitting: collect each "<" together with everything up to the next "<".
unlist(
  strapply(tagtext, "<[^<]*", perl = TRUE)
)
### Section 3.8: File and directory operations
# Working directory, directory listings, file information, and plotting into a file.
getwd()
setwd("C:/_qclwr") # if you don't get a response, it's done
dir("C:/_qclwr/BNCwe", pattern = "^D")
basename("C:/_qclwr/test.txt")
file.info(dir())
# to plot into a file
png(filename = "C:/_qclwr/_outputfiles/03-8_graph.png", width = 600, height = 600)
plot(1:10, 1:10, type = "b")
dev.off() # closes the png device opened above