# [wiki diff legend] Added lines are shown in this color.
# [wiki diff legend] Deleted lines are shown in this color.
# Script for Chapter 3
#### Chapter 3: A brief introduction to R
# Note: The lines are not preceded by "> " here since
# (i) the contrast between what you enter and what R outputs does not have to be highlighted - below you will only find lines you will enter, and
# (ii) I would like you to be able to copy and paste directly without having to delete line-initial "> " all the time.
# Warm-up: one complete call, followed by two deliberately incomplete inputs
# which R only evaluates once the rest arrives on the next line.
mean(c(1, 2, 3))
# "2-" is not a complete expression, so R reads on and evaluates 2-3
2-
3
# the call is only complete when the closing parenthesis arrives;
# library() fails if the add-on package "corpora" is not installed
library(corpora
)
### Section 3.1: A few central notions: data structures, functions, and arguments
# simple arithmetic and function calls typed at the console
2+2
3^2
(2+3)^2
sqrt(5)
# logarithm of 150 to the base 10: first with positional, then with named arguments
log(150, 10)
log(x=150, base=10) # with argument labels
# the next line fails on purpose: R is case-sensitive
Log(150, 10) # R does not know this function - it only knows "log"
aa<-sqrt(5) # compute the square root of 5
# list the data structures in the workspace, then print aa
ls()
aa
# wrapping an assignment in parentheses also prints the assigned value
(aa<-sqrt(5))
(aa<-aa+2)
# a semicolon separates two statements on one line
sqrt(9); sqrt(16)
rm(aa) # delete aa
rm(list=ls(all=TRUE)) # delete all data structures
x<-c(1:10)
# sample() draws randomly from x; the calls below go from fully named arguments
# to positional arguments to relying on the defaults
sample(x, size=5, replace=T, prob=NULL)
sample(x, 5, T, NULL)
sample(x, 5, T)
sample(x, 5, F)
sample(x, 5)
x
# without a size, sample() returns a random reordering of all elements
sample(x)
# a single number n is treated as 1:n
sample(10)
# q() ends the R session - skip this line if you want to continue with the next section
q()
### Section 3.2: Vectors
## Section 3.2.1: Basics
# even a single number is a vector (of length 1)
sqrt(5)
aa<-sqrt(5) # compute the square root of 5
is.vector(aa)
class(aa)
length(aa)
(empty<-vector(length=3)) # create an 'empty' vector of a user-defined length
# a single character string is also a vector of length 1;
# length() counts elements, not characters
(a.name<-"James")
class(a.name)
length(a.name)
# c() combines several elements into one vector
(numbers<-c(1, 2, 3))
# this creates a data structure called "names"; the function names() used further below still works
(names<-c("James", "Jonathan", "Jean-Luc"))
numbers1<-c(1, 2, 3); numbers2<-c(4, 5, 6) # generate two vectors
(numbers1.and.numbers2<-c(numbers1, numbers2)) # join the two vectors
(numbers1.and.numbers2<-append(numbers1, numbers2)) # another way to join the two vectors
numbers1+numbers2 # that is, 1+4, 2+5, 3+6
# a vector of length 1 is applied to every element of the longer vector
bb<-10
numbers1*bb
# a shorter vector is recycled; since 3 is not a multiple of 2, R also issues a warning
bb<-c(10, 20)
numbers1*bb
# give the elements of a vector names
names(numbers)<-c("first", "second", "third"); numbers
# a vector holds one data type only: mixing numbers and strings turns everything into strings
(mixture<-c(1, 2, "Benjamin"))
# str() shows the type and the content of a data structure
str(numbers1)
str(mixture)
## Section 3.2.2: Loading vectors
# The paths assume that the companion files are stored under C:/_qclwr; adjust as needed.
# without a "what" argument scan() reads numbers; sep="\n" makes every line one item
x<-scan(file="C:/_qclwr/_inputfiles/dat_vector-a.txt", sep="\n")
# character data: without sep, items are separated by whitespace ...
x.1<-scan(file="C:/_qclwr/_inputfiles/dat_vector-b.txt", what="char")
# ... with sep="\n", every line becomes one item
x.2<-scan(file="C:/_qclwr/_inputfiles/dat_vector-b.txt", what="char", sep="\n")
x.1
x.2
# interactive alternatives: type a directory path at the prompt, then pick one or more of its files from a menu ...
filename<-select.list(dir(scan(nmax=1, what="char")), multiple=TRUE)
# ... or choose the file to load in a file dialog
x.1<-scan(file.choose(), what="char")
x.2<-scan(file.choose(), what="char", sep="\n")
# keyboard input: scan() keeps reading until it receives an empty line, so the
# blank line after the last number is required - without it the following "x"
# would be read as (invalid) input. Do not put comments between the numbers either.
x<-scan()
1
2
3

x
## Section 3.2.3: Accessing and processing (parts of) vectors
# Subsetting with square brackets (by position, by exclusion, by logical vector,
# via which()), followed by set operations, tabulation, and sorting.
min(c(1, 2, 3)); max(c(1, 2, 3))
x<-c("a", "b", "c", "d", "e")
x[3] # access the 3rd element of x
y<-3
x[y] # access the 3rd element of x
z<-c(1, 3)
x[z] # access the 1st and the 3rd element of x just as x[c(1, 3)] would do
z<-c(1:3)
x[z] # access the elements 1 to 3 of x
x[-2] # access x but without its 2nd element
# a comparison is applied to every element and returns one TRUE/FALSE per element
x=="d"
(x<-c(10:1)) # generate and output a vector with the numbers from 10 to 1
x==4 # which elements of x are 4?
x<=7 # which elements of x are smaller than or equal to 7?
x!=8 # which elements of x are not 8?
(x>8 | x<3) # which elements of x are larger than 8 or smaller than 3?
# which() turns the TRUE/FALSE vector into the positions of the TRUEs
which(x==4) # which elements of x are 4?
which(x<=7) # which elements of x are less than or equal to 7?
which(x!=8) # which elements of x are not 8?
which(x>8 | x<3) # which elements of x are greater than 8 or less than 3?
# the positions can be stored and then used for subsetting
(pointer<-which(x>8 | x<3))
(y<-x[pointer])
x[which(x>8 | x<3)] # output the elements of x which are greater than 8 or smaller than 3
x[x>8 | x<3] # output the elements of x which are greater than 8 or smaller than 3
length(which(x>8 | x<3)) # output the number of elements of x which are greater than 8 or smaller than 3
sum(x>8 | x<3) # output the number of elements of x which are greater than 8 or smaller than 3
x # output x
y<-which(x>8) # store the positions of elements greater than 8 in the vector y
x[y]<-12; x # replace the elements of x that are greater than 8 by 12
x<-c(10:1) # generate a vector with the numbers from 10 to 1 again
x[which(x>8)]<-12; x # change the element(s) in x which are greater than 8 to 12
x<-c(10:1) # generate a vector with the numbers from 10 to 1 again
x[x>8]<-12; x
x<-c(10:1); y<-c(2, 5, 9) # generate vectors again
# %in% returns TRUE/FALSE for each element of the left vector: does it occur in the right one?
x %in% y
y %in% x
x[x %in% y]
# match() returns, for each element of the first vector, its position in the second one (or NA)
match(x, y)
match(y, x)
# set operations on the two vectors
setdiff(x, y)
setdiff(y, x)
intersect(x, y)
intersect(y, x)
union(x, y)
union(y, x)
g<-c(1, 2, 3, 2, 3, 4, 3, 4, 5)
h<-c(2, 3, 1, 5, 2, 6, 3, 1, 2)
# the different values of g, their frequencies, and the cross-tabulation of g and h
unique(g)
table(g)
table(g, h)
h
# sort() returns the sorted values, order() the positions that would sort the vector
sort(h, decreasing=T)
z<-c(3, 5, 10, 1, 6, 7, 8, 2, 4, 9)
order(z, decreasing=F)
##############################################################
# You should now do "Exercise Box 3.1: Handling vectors" ... #
##############################################################
### Section 3.3: Factors
# start from a plain character vector of word-class labels ...
f <- c("open", "open", "open", "closed", "closed")
f
# ... and turn it into a factor (the outer parentheses print the result)
(f <- factor(f))
is.factor(f)
### Section 3.4: Data frames
## Section 3.4.1: Generating data frames in R
# clear the workspace first (this deletes ALL data structures; "all" is matched to ls()'s argument all.names)
rm(list=ls(all=T))
# one vector per column, all of the same length
PartOfSpeech<-c("ADJ", "ADV", "N", "CONJ", "PREP")
TokenFrequency<-c(421, 337, 1411, 458, 455)
TypeFrequency<-c(271, 103, 735, 18, 37)
Class<-c("open", "open", "open", "closed", "closed")
# combine the vectors into a data frame; the vector names become the column names
x<-data.frame(PartOfSpeech, TokenFrequency, TypeFrequency, Class)
x
str(x)
# the dollar sign accesses a single column
x$PartOfSpeech
# alternative: use the parts of speech as row names instead of as a column
(x.2<-data.frame(TokenFrequency, TypeFrequency, Class, row.names=PartOfSpeech))
str(x.2)
## Section 3.4.2: Loading and saving data frames in R
# NOTE: choose.files() opens a file dialog and is only available in R for Windows;
# the "default" argument merely preselects the suggested path.
rm(list=ls(all=T))
# load a tab-separated file whose first line contains the column names
x<-read.table(choose.files(default="C:/_qclwr/_inputfiles/dat_dataframe-a.txt"), header=T, sep="\t", comment.char="") # no row.names: R numbers rows
x.2<-read.table(choose.files(default="C:/_qclwr/_inputfiles/dat_dataframe-a.txt"), header=T, row.names=1, sep="\t", comment.char="") # with row.names
# save as tab-separated text without quotes: without row names ...
write.table(x, choose.files(default="C:/_qclwr/_outputfiles/03-4-2_dataframe-a.txt"), quote=F, sep="\t", row.names=F)
# ... and with row names (col.names=NA adds an empty header cell above the row names)
write.table(x.2, choose.files(default="C:/_qclwr/_outputfiles/03-4-2_dataframe-b.txt"), quote=F, sep="\t", col.names=NA)
## Section 3.4.3: Accessing and processing (parts of) data frames in R
x<-read.table(choose.files(default="C:/_qclwr/_inputfiles/dat_dataframe-a.txt"), header=T, sep="\t", comment.char="")
str(x)
# columns can be accessed with the dollar sign ...
x$TokenFrequency
x$Class
# ... or, after attach(), directly by their names
attach(x)
Class
# changing an attached column creates a changed copy in the workspace;
# the data frame itself is not modified, as printing x shows
(TokenFrequency[4]<-20)
x
TokenFrequency[4]<-458 # change the value back to the old one
# square brackets with [row, column]
x[2,3] # the value of the second row and the third column
x[2,] # all values of the second row (because no column is specified)
x[,3] # all values of the third column (because no row is specified)
x[2:3,4] # two values of the fourth column
x[c(1,3), c(2,4)] # the 1st and 3rd row of the 2nd and 4th column
# conditions on columns work as with vectors
which(x[,2]>450)
x[,3][which(x[,3]>100)]
x[,3][x[,3]>100]
TypeFrequency[TypeFrequency>100]
# several equivalent ways of selecting rows that meet a condition
(y<-x[which(Class=="open"),]) # or shorter: (y<-x[Class=="open",])
(y<-x[which(x[,4]=="open"),]) # or shorter: (y<-x[x[,4]=="open",])
(y<-subset(x, Class=="open"))
(y<-subset(x, Class=="open" & TokenFrequency<1000))
(y<-subset(x, PartOfSpeech %in% c("ADJ", "ADV")))
# sorting rows: order() returns row positions; the minus sign requests descending order
(ordering.index<-order(Class, -TokenFrequency))
x[ordering.index,]
x[order(Class, -TokenFrequency),]
# putting the rows into a random order
no.of.rows<-dim(x)[1] # for the columns: no.of.columns<-dim(x)[2]
ordering.index<-sample(no.of.rows); ordering.index
x[ordering.index,]
x[sample(dim(x)[1]),]
# rank() converts the values into numbers so that the minus sign can also be applied to non-numeric columns
ordering.index<-order(-rank(Class), -rank(PartOfSpeech))
x[ordering.index,]
##################################################################
# You should now do "Exercise Box 3.2: Handling data frames" ... #
##################################################################
### Section 3.5: Lists
rm(list=ls(all=T))
# three data structures of different kinds and sizes ...
a.vector<-c(1:10) # generates a vector with the numbers from one to ten
a.dataframe<-read.table(choose.files(default="C:/_qclwr/_inputfiles/dat_dataframe-a.txt"), header=T, sep="\t", comment.char="") # load the data frame from section 3.4
another.vector<-c("This", "may", "be", "a", "sentence", "from", "a", "corpus","file", ".")
# ... can be stored together in one list
(a.list<-list(a.vector, a.dataframe, another.vector))
str(a.list)
# two ways of naming the parts of a list: when the list is created or afterwards
a.list<-list(Part1=a.vector, Part2=a.dataframe, Part3=another.vector)
names(a.list)<-c("Part1", "Part2", "Part3")
a.list<-list(a.vector, a.dataframe, another.vector) # redefine a.list
# double square brackets return the part itself ...
a.list[[1]]
a.list[[2]]
a.list[[3]]
is.list(a.list[[1]])
is.vector(a.list[[1]])
is.data.frame(a.list[[2]])
is.vector(a.list[[3]])
# ... single square brackets return a list containing that part
a.list[1]
is.list(a.list[1])
a.list[[1]]
a.list[1]
# NOTE(review): a.list was redefined without names above, so the access by name
# returns NULL here unless the names are assigned again first
a.list$Part1 # or a.list[["Part1"]]
a.list[c(1,3)]
a.list[[c(1,3)]] # take the third part of the first element of the list
a.list[[1]][3] # take the third part of the first element of the list
a.list[[1]][3:5] # consecutive parts
a.list[[1]][c(3, 5)] # note that "a.list[[1]][3, 5]" does not work
# the data frame inside the list is accessed with [row, column]
a.list[[2]][3,2]
a.list[[2]][3,2:4]
a.list[[2]][3,c(2, 4)]
x<-a.list[[2]]
# split() divides a data frame into a list of data frames, one per group
# NOTE(review): Class and PartOfSpeech are not created in this section; presumably they are
# still accessible through attach(x) from Section 3.4.3 - verify when running this section on its own
(y<-split(x, Class))
y$open
split(x, list(Class, PartOfSpeech))
### Section 3.6: Elementary programming functions
## Section 3.6.1: Conditional expressions
# a two-way decision: the first block runs if the condition is TRUE, the else block otherwise
a <- 2 # you can of course insert another number here
if (a > 2) {
  cat("a is greater than 2.\n")
} else {
  cat("a is not greater than 2.\n")
}

# this is an alternative: note the use of else if, which allows you to get rid of one additional level of embedding
b <- "car" # you can of course insert another word here
if (b == "automobile") {
  cat("b is 'automobile'!\n")
} else if (b == "car") { # !
  cat("b is 'car'!\n")
} else {
  cat("b is neither 'automobile' nor 'car'.\n")
}
#####################################################################
# You should now do "Exercise Box 3.3: Conditional expressions" ... #
#####################################################################
## Section 3.6.2: Loops
# a for-loop visits the elements of a vector one after the other
for (i in 1:3) {
  cat(i, "\n")
}

# the vector to loop over can also be stored in a data structure first
j <- seq(2, 1, -0.5)
for (i in j) {
  cat(i, "\n")
}

# nested loops: the inner loop runs completely for every step of the outer loop
# (the inner loop variable overwrites the vector j generated above)
for (i in 1:2) {
  for (j in 6:7) {
    cat(i, "times", j, "is", i * j, "\n")
  }
}

# next skips the rest of the current iteration ...
for (i in 1:3) {
  if (i == 2) {
    next
  }
  cat(i, "\n")
}

# ... while break leaves the loop altogether
for (i in 1:5) {
  if (i == 3) {
    break
  }
  cat(i, "\n")
}

# repeat runs until a break is encountered
i <- 1
repeat {
  cat(i, "\n")
  i <- i + 1 # this way of incrementing a variable is extremely useful!
  if (i == 3) {
    break
  }
}

# while runs as long as its condition is TRUE
i <- 1
while (i < 3) {
  cat(i, "\n")
  i <- i + 1
}
###################################################
# You should now do "Exercise Box 3.4: Loops" ... #
###################################################
## Section 3.6.3: Rules of programming
# the frequency data from Section 3.4 again
PartOfSpeech <- c("ADJ", "ADV", "N", "CONJ", "PREP")
TokenFrequency <- c(421, 337, 1411, 458, 455)
TypeFrequency <- c(271, 103, 735, 18, 37)
Class <- c("open", "open", "open", "closed", "closed")

# approach 1: an explicit loop that adds up the token frequencies per class
sum.closed <- 0 # result for the closed classes
sum.open <- 0 # result for the open classes
for (i in 1:5) { # access each word class
  current.class <- Class[i]
  if (current.class == "closed") {
    # the current class is "closed": add its token frequency to the first result
    sum.closed <- sum.closed + TokenFrequency[i]
  } else {
    # the current class is not "closed": add its token frequency to the other result
    sum.open <- sum.open + TokenFrequency[i]
  } # end of if: test current class
} # end of for: access each word class
sum.closed # look at the output
sum.open

# approach 2: subsetting instead of a loop
sum(TokenFrequency[which(Class == "closed")])
sum(TokenFrequency[which(Class == "open")])

# approach 3: tapply applies sum to the token frequencies of each class
tapply(TokenFrequency, Class, sum)
####################################################
# You should now do "Exercise Box 3.5: tapply" ... #
####################################################
# A list whose parts have different lengths: first collect information about
# every part with a loop, then do the same with a single sapply() call.
another.list<-list(c(1), c(2, 3), c(4, 5, 6), c(7, 8, 9, 10))
# the result vector is created with its final length up front instead of being
# grown element by element; seq_along() also copes with an empty list, where
# 1:length(another.list) would wrongly run through 1 and 0
lengths<-integer(length(another.list))
for (i in seq_along(another.list)) {
   lengths[i]<-length(another.list[[i]]) # number of elements of the i-th part
}
lengths
sapply(another.list, length) # the same without a loop
first.elements<-numeric(length(another.list))
for (i in seq_along(another.list)) {
   first.elements[i]<-another.list[[i]][1] # first element of the i-th part
}
first.elements
sapply(another.list, "[", 1) # the same without a loop: "[" is applied to each part with the argument 1
# the same kind of extraction applied to a list with parts of different types;
# a.list is not created here, it must still exist from Section 3.5
sapply(a.list, "[", 1) # use the list we generated in Section 3.5
# lapply() always returns a list; further arguments are handed to the applied function
a<-c(1, 5, 3); b<-c(2, 6, 4); (ab<-list(a, b))
lapply(ab, sort, decreasing=F) # decreasing=F is passed on to sort
### Section 3.7: Character/string processing
## Section 3.7.1: Getting information from and accessing (vectors of) character strings
example <- c("I", "do", "not", "know")
# number of characters of each element
nchar(example)
# substr extracts the characters from a start position to a stop position ...
substr("internationalization", start = 6, stop = 13)
# ... for every element of a vector
substr(example, start = 2, stop = 3)
some.first.vector <- c("abcd", "efgh")
some.other.vector <- c("ijkl", "mnop")
# start and stop positions can differ from element to element
substr(c(some.first.vector, some.other.vector), start = c(1, 2, 3, 4), stop = c(2, 3, 4, 4))
## Section 3.7.2: Elementary ways to change (vectors of) character strings
tolower(example)
toupper(example)
# replace every "o" by "x"
chartr(old = "o", new = "x", x = example)
## Section 3.7.3: Merging and splitting (vectors of) character strings without regular expressions
# several separate strings: sep is what ends up between them
paste("I", "do", "not", "know", sep = " ")
paste("I", "do", "not", "know", collapse = " ") # same result
paste("I", "do", "not", "know", sep = " ", collapse = " ") # same result
# one vector with several elements: sep alone does not merge the elements
paste(example, sep = " ")
# with a longer vector, there is a difference
paste(example, sep = " ", collapse = " ") # but sep=" " is the default of paste and can be omitted
paste(example, collapse = " ")
# the opposite direction: strsplit cuts strings up and returns a list
example.2 <- "I do not know"
strsplit(example.2, split = " ")
strsplit(example.2, split = "")
example.3 <- c("This is the first character string", "This is the second character string")
strsplit(example.3, split = " ")
# unlist turns the list into one vector
unlist(strsplit(example.3, split = " "))
## Section 3.7.4: Searching and replacing without regular expressions
text<-c("This is a first example sentence.", "And this is a second example sentence.")
# grep returns the positions of the elements that contain the search string ...
grep("second", text)
# ... which can be used to get the matching elements themselves; value=T does that in one step
text[grep("second", text)]
grep("second", text, value=T)
# the search string does not have to be a complete word
grep("eco", text, value=T)
grep("is", text)
# regexpr returns, for each element, the starting position of the first match (-1 if there is none);
# the lengths of the matches are stored as an attribute
regexpr("second", text)
attributes(regexpr("second", text)) # returns a list!
attr(regexpr("second", text), "match.length") # returns a vector!
regexpr("e", text)
# gregexpr returns all matches: a list with one part per element of text
gregexpr("e", text)
gregexpr("e", text)[[1]] # and of course the same with [[2]]
unlist(gregexpr("e", text)[1]) # and of course the same with [2]
# different ways of getting the starting positions out of the list
sapply(gregexpr("e", text), c)
unlist(gregexpr("e", text))
# the attributes belong to the parts of the list, so they must be accessed with double square brackets
attributes(gregexpr("e", text))
attributes(gregexpr("e", text)[1])
gregexpr("e", text)[[1]]
attributes(gregexpr("e", text)[[1]]) # returns a list, same with [[2]]
unlist(attributes(gregexpr("e", text)[[1]]))
attr(gregexpr("e", text)[[1]], "match.length") # returns a vector, same with [[2]]
# the same for all parts of the list at once
sapply(gregexpr("e", text), attributes)
sapply(gregexpr("e", text), attr, "match.length")
unlist(sapply(gregexpr("e", text), attr, "match.length"))
# strapply comes from the add-on package gsubfn, which must be installed;
# it is called here with the vector to search and the search string
library(gsubfn)
text<-c("This is a first example sentence.", "And this is a second example sentence.")
strapply(text, "first")
strapply(text, "is")
# Replacing: sub changes only the first match in each element, gsub changes all matches.
# NOTE(review): this line used to be a second, identical gsub call; it was presumably meant to be sub - confirm
sub("a", "the", text)
gsub("a", "the", text) # this also changes the "a" inside "example"
gsub(" a ", " the ", text) # the spaces restrict the match to the separate word "a"
text # the original vector has not been changed ...
(text.2<-gsub(" a ", " the ", text)) # ... the result must be assigned to be kept
## Section 3.7.5: Searching and replacing with regular expressions
# All calls use perl=T, i.e. Perl-compatible regular expressions; they rely on the vector "text" from Section 3.7.4.
# "^" anchors the match at the beginning of the string
grep("^t", text, ignore.case=T, perl=T, value=T)
# "." stands for any one character
grep("s.c", text, ignore.case=T, perl=T, value=T)
grep("f...t", text, ignore.case=T, perl=T, value=T)
# a literal period must be escaped
gsub("\\.", "!", text, perl=T)
colors<-c("color", "colour")
# "?" makes the preceding element optional
grep("colou?r", colors, perl=T, value=T)
# NOTE(review): the string below contains only single spaces, so replacing " +" by " " changes nothing;
# presumably the original example contained runs of several spaces - verify against the book
some.text.line<-"This is just one example."
gsub(" +", " ", some.text.line, perl=T)
# {min,max} specifies how often the preceding element may occur
grep("colou{0,1}r", colors, perl=T, value=T)
some.vector<-c("a st b", "a stst b", "a st st b")
# parentheses group elements so that a quantifier applies to the whole group
grep("(st){2,3}", some.vector, perl=T, value=T)
grep("(.t){2,3}", some.vector, perl=T, value=T)
# "|" separates alternatives
gsub("a (first|second)", "another", text, ignore.case=T, perl=T)
gsub("(a|e|i|o|u)", "V", text, ignore.case=T, perl=T)
# square brackets define a character class: any one of the listed characters or ranges
gsub("[aeiou]", "V", text, ignore.case=T, perl=T)
gsub("[a-ht-z]", "X", text, ignore.case=T, perl=T)
gsub("[1-5]", "3", "0123456789", perl=T)
#######################################################################
# You should now do "Exercise Box 3.6: a few regular expressions" ... #
#######################################################################
# "^" as the first character inside square brackets negates the character class
gsub("[^1-5]", "3", "0123456789", perl=T)
# predefined classes: \\d digit, \\D non-digit, \\w word character, \\W non-word character, \\b word boundary
gsub("\\d", "3", "a1b2c3d4e5", perl=T)
gsub("\\D", "3", "a1b2c3d4e5", perl=T)
gsub("\\w+", "WRD", text, perl=T)
gsub("\\w\\W", "<WB>", text, perl=T)
gsub("\\W\\w", "<WB>", text, perl=T)
gsub("\\b", "<WB>", text, perl=T)
# quantifiers are greedy by default: ".*" matches as much as possible
(the.match<-gregexpr("s.*s", text[1], perl=T))
# extract the matched string: from the starting position to starting position + match length - 1
substr(text[1], unlist(the.match), unlist(the.match)+attr(the.match[[1]], "match.length")-1)
some.corpus.line<-"he said, you are lazy and you are stupid."
gregexpr("you.*are", some.corpus.line, perl=T)
# two ways of making the matching non-greedy: the flag (?U) for the whole expression or "?" after a quantifier
gregexpr("(?U)you.*are", some.corpus.line, perl=T)
gregexpr("you.*?are", some.corpus.line, perl=T)
gregexpr("s.*?s", text[1], perl=T)
# replacing the words deletes them, and a word boundary has no content that could be kept ...
(text.2<-gsub("\\w+", "<w>", text, perl=T))
(text.2<-gsub("\\b", "<w>", text, perl=T))
# ... but what is matched within parentheses can be inserted again: \\1 is the content of the first pair of parentheses
(text.2<-gsub("(\\w+)", "\\1<w>", text, perl=T))
(text.3<-gsub("([!:,\\.\\?])", "\\1<p>", text.2, perl=T))
American.dates<-c("5/15/1976", "2.15.1970", "1.9.2006")
# swap the first two numbers of each date; the first version always inserts "/" as the separator ...
(British.dates<-sub("(\\d{1,2})\\D(\\d{1,2})\\D", "\\2/\\1/", American.dates, perl=T))
# ... the second version re-uses the separators that were found in the input
(British.dates<-sub("(\\d{1,2})(\\D)(\\d{1,2})(\\D)", "\\3\\4\\1\\2", American.dates, perl=T))
# backreferences within the search expression: \\1 must match the same characters the first parentheses matched
gsub("(\\w+?)(\\W+\\w*?)\\1(\\W)", "\\1<r>\\2\\1<r>\\3", text, perl=T)
gsub("(\\w+?)(\\W+\\w*?)\\1(\\W)", "\\1<r>\\2\\1<r>\\3", "This not is my dog", perl=T)
gsub("(\\w+?)(\\W+\\w*?)\\1\\b", "\\1<r>\\2\\1<r>", text, perl=T)
# note: "example" is redefined here
example<-c("abcd", "abcde", "abcdf")
# lookahead (?=...): what follows must match, but does not become part of the match
grep("abc(?=de)", example, perl=T)
grep("abcde", example, perl=T)
gsub("abcde", "xyz", example, perl=T)
gsub("abc(?=de)", "xyz", example, perl=T)
gsub("(.)(?=e)", "\\1\\1", example, perl=T)
text
gregexpr("s.*?(?=s)", text[1], perl=T)
# negative lookahead (?!...): what follows must not match; unlike [^e], it does not require or consume a following character
gsub("d(?!e)", "D", example, perl=T)
gsub("d[^e]", "D", example, perl=T)
# Deleting tags from an annotated corpus line, keeping only the <w ...> and <c ...> tags.
# The first expression deletes tags whose first character is neither "w" nor "c" and whose second character is not a space.
example1<-"<w UNC>er<c PUN>, <w AV0>anyway <w PNP>we<w VBB>'re <w AJ0>alright <w AV0>now <ptr target=KB0LC003><w AV0>so<c PUN>, <w PNP>you <w VVB>know <ptr target=KB0LC004></u>"
gsub("<[^wc][^ ].*?>", "", example1, perl=T)
# example2 ends in a tag that begins with "w" without being a word tag ("<wtr ...>"): the expression does not match it
example2<-"<w UNC>er<c PUN>, <w AV0>anyway <w PNP>we<w VBB>'re <w AJ0>alright <w AV0>now <ptr target=KB0LC003><w AV0>so<c PUN>, <w PNP>you <w VVB>know <wtr target=KB0LC004></u>"
gsub("<[^wc][^ ].*?>", "", example2, perl=T)
# example3 ends in a tag with a space as its second character ("<p tr ...>"): the expression does not match it either
example3<-"<w UNC>er<c PUN>, <w AV0>anyway <w PNP>we<w VBB>'re <w AJ0>alright <w AV0>now <ptr target=KB0LC003><w AV0>so<c PUN>, <w PNP>you <w VVB>know <p tr target=KB0LC004></u>"
gsub("<[^wc][^ ].*?>", "", example3, perl=T)
# negative lookahead: delete every tag that is not "w" or "c" plus a space plus a three-character code (optionally followed by "-" and three more characters)
gsub("<(?![wc] ...(-...)?>).*?>", "", example1, perl=T)
# the same, but also delete whatever follows such a tag up to the next "<"
gsub("<(?![wc] ...(-...)?>).*?>[^<]*", "", example1, perl=T)
# strapply (from the package gsubfn loaded in Section 3.7.4) with regular expressions
text<-c("This is a first example sentence.", "And this is a second example sentence.")
strapply(text[1], "first")
# the search expression describes letter sequences that contain an "i"
strapply(text, "[a-z]*i[a-z]*", ignore.case=T, perl=T)
# the third argument is a function that is applied to the matches, here nchar and c
strapply(text, "[a-z]*i[a-z]*", nchar, ignore.case=T, perl=T)
strapply(text, "([a-z]*)i([a-z]*)", c, ignore.case=T, perl=T)
# NOTE(review): backref presumably controls whether the whole match and/or the parenthesized parts are handed to the function - see the gsubfn documentation
strapply(text, "([a-z]*)i([a-z]*)", c, ignore.case=T, perl=T, backref=-1)
strapply(text, "([a-z]*)i([a-z]*)", c, ignore.case=T, perl=T, backref=1)
############################################################################
# You should now do "Exercise Box 3.7: a few more regular expressions" ... #
############################################################################
## Section 3.7.6: Merging and splitting (vectors of) character strings with regular expressions
# Splitting a tagged corpus line at its tags; the expression is extended step by step to cover more kinds of tags.
tagtext<-"<w DPS>my <w NN1>mum <w CJC>and <w DPS>my <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"
# word tags with three-character codes only
unlist(strsplit(tagtext, "<w ...>", perl=T))
tagtext<-"<w DPS>my <w NN1>mum <c PUN>, <w DPS>my <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"
# word tags and punctuation tags
unlist(strsplit(tagtext, "<[wc] ...>", perl=T))
tagtext<-"<w DPS>my <w NN1>mum <c PUN>, <w DPS>my <w AJ0-VVN>beloved <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"
# codes that consist of two three-character parts joined by "-": as alternatives or as a range of lengths
unlist(strsplit(tagtext, "<[wc] (...|...-...)>", perl=T))
unlist(strsplit(tagtext, "<[wc] .{3,7}>", perl=T))
tagtext<-"<w DPS>my <w NN1>service <w DPS>my <w AJ0-VVN>beloved <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"
unlist(strsplit(tagtext, "<[wc] .{3,7}>", perl=T))
# the first "service" is followed by a space, the last one is not, so they are counted as different strings ...
table(unlist(strsplit(tagtext, "<[wc] .{3,7}>", perl=T)))
# ... unless the spaces around the tags are made part of the split expression
unlist(strsplit(tagtext, " *<[wc] .{3,7}> *", perl=T))
# the other way round: retrieve every tag together with the material that follows it (strapply requires the package gsubfn)
unlist(strapply(tagtext, "<[^<]*", perl=T))
### Section 3.8: File and directory operations
# The paths assume that the companion files are stored under C:/_qclwr; adjust as needed.
# show the current working directory and change it
getwd()
setwd("C:/_qclwr") # if you don't get a response, it's done
# list the files in a directory whose names begin with "D"
dir("C:/_qclwr/BNCwe", pattern="^D")
# the file name without the path
basename("C:/_qclwr/test.txt")
# information about all files in the current working directory
file.info(dir())
# to plot into a file
# png() opens a graphics file ("file" is matched to its argument filename), plot() draws into it,
# and dev.off() closes the file - only then is the graph complete
png(file="C:/_qclwr/_outputfiles/03-8_graph.png", width=600, height=600)
plot(1:10, 1:10, type="b")
dev.off()