# Script for Chapter 3 (第3章のスクリプト)

#### Chapter 3: A brief introduction to R
# Note: The lines are not preceded by "> " here since
# (i) the contrast between what you enter and what R outputs does not have to be highlighted - below you will only find lines you will enter, and
# (ii) I would like you to be able to copy and paste directly without having to delete line-initial "> " all the time.

# Warm-up: one complete call, followed by two expressions that are split across
# two lines (presumably to show that R keeps reading until an expression is complete).
mean(c(1, 2, 3))

# The first line ends after the operator, so the expression is only complete
# once the "3" on the next line has been read.
2-
3

# Same situation with an unclosed parenthesis: the call is completed by the ")"
# on the following line. Requires the corpora package to be installed.
library(corpora
)



### Section 3.1: A few central notions: data structures, functions, and arguments
# Basic arithmetic; parentheses control the order of evaluation.
2+2
3^2
(2+3)^2

sqrt(5)

# log() with positional arguments: the value first, then the base.
log(150, 10)

log(x=150, base=10) # with argument labels

# NOTE: the next line is expected to raise an error (R is case-sensitive); if this
# file is run with source(), execution stops here.
Log(150, 10) # R does not know this function - it only knows "log"

aa<-sqrt(5) # compute the square root of 5

ls() # list the objects in the workspace

aa

# Wrapping an assignment in parentheses also prints the assigned value.
(aa<-sqrt(5))

(aa<-aa+2)

# A semicolon separates two expressions on one line.
sqrt(9); sqrt(16)

rm(aa) # delete aa

# "all" partially matches the all.names argument of ls(), so objects whose names
# begin with a dot are removed as well.
rm(list=ls(all=TRUE)) # delete all data structures

x<-c(1:10)

# The first three calls are equivalent: named arguments, then the same values
# by position, then with the trailing prob argument left at its default.
sample(x, size=5, replace=T, prob=NULL)

sample(x, 5, T, NULL)

sample(x, 5, T)

sample(x, 5, F) # sampling without replacement

sample(x, 5) # replace defaults to FALSE

x
sample(x) # size defaults to length(x): a random permutation of x

sample(10) # a single number n >= 1 is treated like 1:n

# NOTE: q() ends the R session; everything below is only reached if this line is skipped.
q()



### Section 3.2: Vectors
## Section 3.2.1: Basics
# Vectors: even a single number or string is a vector of length 1.
sqrt(5)

aa<-sqrt(5) # compute the square root of 5
is.vector(aa)
class(aa)

length(aa)

# vector() defaults to mode "logical", so this is a vector of three FALSE values.
(empty<-vector(length=3)) # create an 'empty' vector of a user-defined length

(a.name<-"James")
class(a.name)

length(a.name)

(numbers<-c(1, 2, 3))
# NOTE: this object is called "names" like the base function; calls such as
# names(numbers) further below still find the function.
(names<-c("James", "Jonathan", "Jean-Luc"))

numbers1<-c(1, 2, 3); numbers2<-c(4, 5, 6) # generate two vectors
(numbers1.and.numbers2<-c(numbers1, numbers2)) # join the two vectors
(numbers1.and.numbers2<-append(numbers1, numbers2)) # another way to join the two vectors

numbers1+numbers2 # that is, 1+4, 2+5, 3+6

# A length-1 vector is recycled for every element of the longer one.
bb<-10
numbers1*bb

# Recycling with lengths 3 and 2: R still computes a result but issues a warning
# because the longer length is not a multiple of the shorter one.
bb<-c(10, 20)
numbers1*bb

# Give the elements of numbers names, then print the vector.
names(numbers)<-c("first", "second", "third"); numbers

# Mixing numbers and a string in c() coerces every element to character.
(mixture<-c(1, 2, "Benjamin"))

str(numbers1)
str(mixture)



## Section 3.2.2: Loading vectors
# Load vectors from text files with scan(). The paths are hard-coded for the
# book's C:/_qclwr directory layout.
# No "what" argument: scan() reads numbers; sep="\n" makes every line one element.
x<-scan(file="C:/_qclwr/_inputfiles/dat_vector-a.txt", sep="\n")

# what="char" reads character strings. Without sep the input is split at
# whitespace (x.1); with sep="\n" every line becomes one element (x.2).
x.1<-scan(file="C:/_qclwr/_inputfiles/dat_vector-b.txt", what="char")
x.2<-scan(file="C:/_qclwr/_inputfiles/dat_vector-b.txt", what="char", sep="\n")

x.1
x.2

# Interactive: the inner scan() reads one string from the console (presumably a
# directory path), dir() lists that directory, and select.list() lets the user
# pick one or more of the listed names.
filename<-select.list(dir(scan(nmax=1, what="char")), multiple=T)

# Interactive: file.choose() opens a file selection dialog.
x.1<-scan(file.choose(), what="char")
x.2<-scan(file.choose(), what="char", sep="\n")

# scan() without a file reads from the console. The three lines after the call
# are its input and the empty line ends the input - do not insert anything
# between them.
x<-scan()
1
2
3

x



## Section 3.2.3: Accessing and processing (parts of) vectors
# Accessing and changing parts of vectors with positions, logical tests and which().
min(c(1, 2, 3)); max(c(1, 2, 3))

x<-c("a", "b", "c", "d", "e")
x[3] # access the 3rd element of x

y<-3
x[y] # access the 3rd element of x

z<-c(1, 3)
x[z] # access the 1st and the 3rd element of x just as x[c(1, 3)] would do

z<-c(1:3)
x[z] # access the elements 1 to 3 of x

x[-2] # access x but without its 2nd element

# A comparison is applied to every element and returns a logical vector.
x=="d"

(x<-c(10:1)) # generate and output a vector with the numbers from 10 to 1
x==4 # which elements of x are 4?
x<=7 # which elements of x are smaller than or equal to 7?
x!=8 # which elements of x are not 8?
(x>8 | x<3) # which elements of x are larger than 8 or smaller than 3?

# which() turns such a logical vector into the positions of its TRUE values.
which(x==4) # which elements of x are 4?

which(x<=7) # which elements of x are less than or equal to 7?
which(x!=8) # which elements of x are not 8?
which(x>8 | x<3) # which elements of x are greater than 8 or less than 3?

# Store the positions first, then use them to extract the elements.
(pointer<-which(x>8 | x<3))
(y<-x[pointer])

x[which(x>8 | x<3)] # output the elements of x which are greater than 8 or smaller than 3

x[x>8 | x<3] # output the elements of x which are greater than 8 or smaller than 3

length(which(x>8 | x<3)) # output the number of elements of x which are greater than 8 or smaller than 3

# sum() counts the TRUE values because TRUE is treated as 1 and FALSE as 0.
sum(x>8 | x<3) # output the number of elements of x which are greater than 8 or smaller than 3

x # output x
y<-which(x>8) # store the positions of elements greater than 8 in the vector y
x[y]<-12; x # replace the elements of x that are greater than 8 by 12

x<-c(10:1) # generate a vector with the numbers from 10 to 1 again
x[which(x>8)]<-12; x # change the element(s) in x which are greater than 8 to 12

x<-c(10:1) # generate a vector with the numbers from 10 to 1 again
x[x>8]<-12; x # the same replacement with a logical index instead of which()

# Comparing two vectors, tabulating and sorting.
x<-c(10:1); y<-c(2, 5, 9) # generate vectors again

# %in% returns one logical value per element of its left-hand operand.
x %in% y
y %in% x

x[x %in% y]

# match() returns, for each element of the first vector, the position of its
# first occurrence in the second vector (NA if there is none).
match(x, y)

match(y, x)

# Set operations; each pair shows both argument orders.
setdiff(x, y)
setdiff(y, x)

intersect(x, y)
intersect(y, x)

union(x, y)
union(y, x)

g<-c(1, 2, 3, 2, 3, 4, 3, 4, 5)
h<-c(2, 3, 1, 5, 2, 6, 3, 1, 2)
unique(g)

table(g) # frequency of each value of g

table(g, h) # cross-tabulation of g and h

h
sort(h, decreasing=T)

# order() returns the positions that would sort z, not the sorted values.
z<-c(3, 5, 10, 1, 6, 7, 8, 2, 4, 9)
order(z, decreasing=F)

##############################################################
# You should now do "Exercise Box 3.1: Handling vectors" ... #
##############################################################



### Section 3.3: Factors
# Turn a character vector into a factor: print it before and after the
# conversion, then confirm its type.
f<-c("open", "open", "open", "closed", "closed")
f
(f<-factor(f))
is.factor(f)



### Section 3.4: Data frames
## Section 3.4.1: Generating data frames in R
# Build a data frame from four vectors of equal length.
# NOTE: the first line clears the whole workspace ("all" partially matches the
# all.names argument of ls()).
rm(list=ls(all=T))
PartOfSpeech<-c("ADJ", "ADV", "N", "CONJ", "PREP")
TokenFrequency<-c(421, 337, 1411, 458, 455)
TypeFrequency<-c(271, 103, 735, 18, 37)
Class<-c("open", "open", "open", "closed", "closed")

# The vector names become the column names.
x<-data.frame(PartOfSpeech, TokenFrequency, TypeFrequency, Class)

x
str(x)

x$PartOfSpeech # access one column by name

# Variant: use PartOfSpeech as row names instead of as a column.
(x.2<-data.frame(TokenFrequency, TypeFrequency, Class, row.names=PartOfSpeech))
str(x.2)



## Section 3.4.2: Loading and saving data frames in R
# Read and write tab-separated data frames. choose.files() opens a file dialog
# preset to the given default path; it is available on Windows only.
# NOTE: the first line clears the whole workspace.
rm(list=ls(all=T))
# comment.char="" switches off comment handling, so "#" in the data is read as data.
x<-read.table(choose.files(default="C:/_qclwr/_inputfiles/dat_dataframe-a.txt"), header=T, sep="\t", comment.char="") # no row.names: R numbers rows

x.2<-read.table(choose.files(default="C:/_qclwr/_inputfiles/dat_dataframe-a.txt"), header=T, row.names=1, sep="\t", comment.char="") # with row.names

# quote=F writes strings without quotation marks; row.names=F omits the row numbers.
write.table(x, choose.files(default="C:/_qclwr/_outputfiles/03-4-2_dataframe-a.txt"), quote=F, sep="\t", row.names=F)

# col.names=NA writes the row names and adds an empty header cell above them.
write.table(x.2, choose.files(default="C:/_qclwr/_outputfiles/03-4-2_dataframe-b.txt"), quote=F, sep="\t", col.names=NA)



## Section 3.4.3: Accessing and processing (parts of) data frames in R
# Accessing, subsetting and reordering a data frame.
# Assumes the file has the columns PartOfSpeech, TokenFrequency, TypeFrequency
# and Class in this order, like the data frame of Section 3.4.1 - TODO confirm.
x<-read.table(choose.files(default="C:/_qclwr/_inputfiles/dat_dataframe-a.txt"), header=T, sep="\t", comment.char="")

str(x)
x$TokenFrequency
x$Class

# attach() makes the columns of x accessible by their bare names.
attach(x)

Class

# This assignment creates a changed copy called TokenFrequency in the workspace;
# the data frame x itself is not modified, as printing x shows.
(TokenFrequency[4]<-20)

x

TokenFrequency[4]<-458 # change the value back to the old one

x[2,3] # the value of the second row and the third column
x[2,] # all values of the second row (because no column is specified)
x[,3] # all values of the third column (because no row is specified)
x[2:3,4] # two values of the fourth column
x[c(1,3), c(2,4)] # the 1st and 3rd row of the 2nd and 4th column

# Conditions on columns: positions via which(), or a logical index directly.
which(x[,2]>450)
x[,3][which(x[,3]>100)]
x[,3][x[,3]>100]
TypeFrequency[TypeFrequency>100]

# Three ways of selecting the rows whose Class is "open".
(y<-x[which(Class=="open"),]) # or shorter: (y<-x[Class=="open",])

(y<-x[which(x[,4]=="open"),]) # or shorter: (y<-x[x[,4]=="open",])

(y<-subset(x, Class=="open"))

(y<-subset(x, Class=="open" & TokenFrequency<1000))
(y<-subset(x, PartOfSpeech %in% c("ADJ", "ADV")))

# Sort by Class (ascending) and, within Class, by TokenFrequency (descending).
(ordering.index<-order(Class, -TokenFrequency))

x[ordering.index,]

x[order(Class, -TokenFrequency),]

no.of.rows<-dim(x)[1] # for the columns: no.of.columns<-dim(x)[2]

# Random order: sample() without further arguments permutes 1:no.of.rows.
ordering.index<-sample(no.of.rows); ordering.index

x[ordering.index,]

x[sample(dim(x)[1]),]

# rank() converts the columns into numbers so that the minus sign (descending
# order) can be applied to them.
ordering.index<-order(-rank(Class), -rank(PartOfSpeech))
x[ordering.index,]

##################################################################
# You should now do "Exercise Box 3.2: Handling data frames" ... #
##################################################################



### Section 3.5: Lists
# Build a list from a numeric vector, a data frame and a character vector.
# NOTE: the first line clears the whole workspace; choose.files() is Windows-only.
rm(list=ls(all=T))
a.vector<-c(1:10) # generates a vector with the numbers from one to ten
a.dataframe<-read.table(choose.files(default="C:/_qclwr/_inputfiles/dat_dataframe-a.txt"), header=T, sep="\t", comment.char="") # load the data frame from section 3.4
another.vector<-c("This", "may", "be", "a", "sentence", "from", "a", "corpus","file", ".")
(a.list<-list(a.vector, a.dataframe, another.vector))

str(a.list)

# Two ways to obtain a list with named parts: name them when creating the list ...
a.list<-list(Part1=a.vector, Part2=a.dataframe, Part3=another.vector)

# ... or assign the names afterwards.
names(a.list)<-c("Part1", "Part2", "Part3")

# The redefined list has no names: name-based access such as a.list$Part1
# returns NULL until names are assigned again.
a.list<-list(a.vector, a.dataframe, another.vector) # redefine a.list

# Access parts of a.list (created earlier in this section): double square
# brackets return the part itself, single square brackets return a list.
a.list[[1]]
a.list[[2]]
a.list[[3]]

is.list(a.list[[1]])
is.vector(a.list[[1]])
is.data.frame(a.list[[2]])
is.vector(a.list[[3]])

a.list[1]
is.list(a.list[1])

a.list[[1]]
a.list[1]

# a.list was last defined without names, so a.list$Part1 would return NULL;
# assign the names (again) before accessing a part by its name.
names(a.list)<-c("Part1", "Part2", "Part3")
a.list$Part1 # or a.list[["Part1"]]

a.list[c(1,3)] # a list consisting of the 1st and the 3rd part

# A vector inside double square brackets indexes recursively: [[c(1,3)]] is [[1]][[3]].
a.list[[c(1,3)]] # take the third part of the first element of the list

a.list[[1]][3] # take the third part of the first element of the list

a.list[[1]][3:5] # consecutive parts
a.list[[1]][c(3, 5)] # note that "a.list[[1]][3, 5]" does not work

# The second part is a data frame, so [row, column] indexing applies to it.
a.list[[2]][3,2]
a.list[[2]][3,2:4]

a.list[[2]][3,c(2, 4)]

# Split the data frame stored in the list into one data frame per group.
# The grouping columns are taken from x itself (x$Class, x$PartOfSpeech) rather
# than from bare names that are only visible while an earlier attach(x) is
# still in effect, so this also works in a fresh session.
x<-a.list[[2]]
(y<-split(x, x$Class))

y$open # the part of the result for Class "open"

# Grouping by two columns yields one part per combination of their values.
split(x, list(x$Class, x$PartOfSpeech))



### Section 3.6: Elementary programming functions
## Section 3.6.1: Conditional expressions
# if/else: exactly one of the two branches is executed, depending on the test.
a<-2 # you can of course insert another number here
if (a>2) {
   cat("a is greater than 2.\n")
} else {
   cat("a is not greater than 2.\n")
}

# this is an alternative: note the use of else if, which allows you to get rid of one additional level of embedding
# The tests are tried from top to bottom; the final else covers all remaining cases.
b<-"car" # you can of course insert another word here
if (b=="automobile") {
   cat("b is 'automobile'!\n")
} else if (b=="car") { # !
   cat("b is 'car'!\n")
} else {
   cat("b is neither 'automobile' nor 'car'.\n")
}

#####################################################################
# You should now do "Exercise Box 3.3: Conditional expressions" ... #
#####################################################################



## Section 3.6.2: Loops
# for-loop over a sequence of integers: prints 1, 2, 3 on separate lines.
for (i in 1:3) {
   cat(i, "\n")
}

# A for-loop can iterate over any vector, here 2, 1.5, 1.
j<-seq(2, 1, -0.5)
for (i in j) {
   cat(i, "\n")
}

# Nested loops: the inner loop runs completely for every value of the outer one.
# NOTE: the inner loop variable overwrites the vector j defined above.
for (i in 1:2) {
   for (j in 6:7) {
      cat(i, "times", j, "is", i*j, "\n")
   }
}

# next skips the rest of the current iteration: 2 is not printed.
for (i in 1:3) {
   if (i==2) {
      next
   }
   cat(i, "\n")
}

# break leaves the loop altogether: only 1 and 2 are printed.
for (i in 1:5) {
   if (i==3) {
      break
   }
   cat(i, "\n")
}

# repeat loops until break is reached; the counter has to be managed by hand.
i<-1
repeat {
   cat(i, "\n")
   i<-i+1 # this way of incrementing a variable is extremely useful!
   if (i==3) { break }
}

# while tests its condition before every iteration: prints 1 and 2.
i<-1
while (i<3) {
   cat(i, "\n")
   i<-i+1
}

###################################################
# You should now do "Exercise Box 3.4: Loops" ... #
###################################################



## Section 3.6.3: Rules of programming
# Sum the token frequencies per word class in three ways: with an explicit
# loop, with logical subsetting, and with tapply().
PartOfSpeech<-c("ADJ", "ADV", "N", "CONJ", "PREP")
TokenFrequency<-c(421, 337, 1411, 458, 455)
TypeFrequency<-c(271, 103, 735, 18, 37)
Class<-c("open", "open", "open", "closed", "closed")

# (1) Loop version. seq_along(Class) instead of a hard-coded 1:5 keeps the loop
# correct if the vectors above are given a different number of elements.
sum.closed<-0; sum.open<-0 # define two vectors for the results
for (i in seq_along(Class)) {
   current.class<-Class[i] # access each word class
   if (current.class=="closed") {
      sum.closed<-sum.closed+TokenFrequency[i] # if the current class is "closed", add its token frequency to the first result vector
   } else {
      sum.open<-sum.open+TokenFrequency[i] # if the current class is not "closed", add its token frequency to the other result vector
   } # end of if: test current class
} # end of for: access each word class
sum.closed; sum.open # look at the output

# (2) The same sums without a loop: select the frequencies of one class and add them up.
sum(TokenFrequency[which(Class=="closed")])
sum(TokenFrequency[which(Class=="open")])

# (3) tapply() applies sum to the token frequencies of every class in one call.
tapply(TokenFrequency, Class, sum)

####################################################
# You should now do "Exercise Box 3.5: tapply" ... #
####################################################

# Apply the same operation to every part of a list: first with a loop, then
# with the equivalent one-line sapply() call.
another.list<-list(c(1), c(2, 3), c(4, 5, 6), c(7, 8, 9, 10))

# Number of elements of each part. The result vector is created with its final
# length instead of being grown, and seq_along() (unlike 1:length(...)) does
# not iterate at all when the list is empty.
# NOTE: the object name "lengths" is the same as that of a base R function.
lengths<-integer(length(another.list))
for (i in seq_along(another.list)) {
   lengths[i]<-length(another.list[[i]])
}
lengths

sapply(another.list, length)

# First element of each part, again as a loop and then with sapply().
first.elements<-numeric(length(another.list))
for (i in seq_along(another.list)) {
   first.elements[i]<-another.list[[i]][1]
}
first.elements

# "[" is the subsetting function; the 1 is passed on to it as the index.
sapply(another.list, "[", 1)

# Requires a.list from Section 3.5 to exist in the workspace.
sapply(a.list, "[", 1) # use the list we generated in Section 3.5

# lapply() always returns a list; further arguments are handed to the applied function.
a<-c(1, 5, 3); b<-c(2, 6, 4); (ab<-list(a, b))
lapply(ab, sort, decreasing=F) # decreasing=F is passed on to sort



### Section 3.7: Character/string processing
## Section 3.7.1: Getting information from and accessing (vectors of) character strings
# Lengths and parts of character strings.
# NOTE: the object name "example" is the same as that of a function in the utils package.
example<-c("I", "do", "not", "know")
nchar(example) # number of characters of each element

substr("internationalization", 6, 13) # characters 6 to 13

substr(example, 2, 3) # applied to every element of the vector

# Start and stop positions can be vectors too: element i of the character
# vector is cut using the i-th start and the i-th stop value.
some.first.vector<-c("abcd", "efgh")
some.other.vector<-c("ijkl", "mnop")
substr(c(some.first.vector, some.other.vector), c(1, 2, 3, 4), c(2, 3, 4, 4))



## Section 3.7.2: Elementary ways to change (vectors of) character strings
# Case conversion of every element of the vector.
tolower(example)
toupper(example)

# chartr() translates characters one by one: every "o" becomes "x".
chartr("o", "x", example)



## Section 3.7.3: Merging and splitting (vectors of) character strings without regular expressions
# paste(): sep joins the arguments element by element, collapse joins the
# elements of the resulting vector into a single string.
paste("I", "do", "not", "know", sep=" ")
paste("I", "do", "not", "know", collapse=" ") # same result
paste("I", "do", "not", "know", sep=" ", collapse=" ") # same result

# With a single vector argument sep has nothing to join: the vector is returned unchanged.
paste(example, sep=" ")
# with a longer vector, there is a difference
paste(example, sep=" ", collapse=" ") # but sep=" " is the default of paste and can be omitted
paste(example, collapse=" ")

# strsplit() returns a list with one part per input string.
example.2<-"I do not know"
strsplit(example.2, " ")

strsplit(example.2, "") # an empty split string splits into single characters

example.3<-c("This is the first character string", "This is the second character string")
strsplit(example.3, " ")

unlist(strsplit(example.3, " ")) # flatten the list into one vector



## Section 3.7.4: Searching and replacing without regular expressions
# Searching and replacing with literal search strings.
# NOTE: the object name "text" is the same as that of a function in the graphics package.
text<-c("This is a first example sentence.", "And this is a second example sentence.")

# grep() returns the positions of the elements that contain a match ...
grep("second", text)

text[grep("second", text)]

# ... or, with value=T, the matching elements themselves.
grep("second", text, value=T)

grep("eco", text, value=T)

grep("is", text)

# regexpr() returns, per element, the start position of the first match (-1 if
# there is none); the match lengths are stored as an attribute.
regexpr("second", text)

attributes(regexpr("second", text)) # returns a list!

attr(regexpr("second", text), "match.length") # returns a vector!

regexpr("e", text)

# gregexpr() returns all matches: a list with one vector of start positions per element.
gregexpr("e", text)

gregexpr("e", text)[[1]] # and of course the same with [[2]]

unlist(gregexpr("e", text)[1]) # and of course the same with [2]

sapply(gregexpr("e", text), c)

unlist(gregexpr("e", text))

attributes(gregexpr("e", text))
attributes(gregexpr("e", text)[1])

gregexpr("e", text)[[1]]

attributes(gregexpr("e", text)[[1]]) # returns a list, same with [[2]]
unlist(attributes(gregexpr("e", text)[[1]]))

attr(gregexpr("e", text)[[1]], "match.length") # returns a vector, same with [[2]]

sapply(gregexpr("e", text), attributes)

sapply(gregexpr("e", text), attr, "match.length")

unlist(sapply(gregexpr("e", text), attr, "match.length"))

# strapply() comes from the gsubfn package, which must be installed.
library(gsubfn)
text<-c("This is a first example sentence.", "And this is a second example sentence.")
strapply(text, "first")
strapply(text, "is")

# gsub() replaces every match in every element.
gsub("a", "the", text)

# NOTE(review): this call is identical to the previous one - possibly sub()
# (which replaces only the first match per element) was intended for one of the
# two; confirm against the book.
gsub("a", "the", text)

# The spaces restrict the replacement to "a" as a separate word.
gsub(" a ", " the ", text)

text # the original vector is unchanged unless the result is assigned

(text.2<-gsub(" a ", " the ", text))



## Section 3.7.5: Searching and replacing with regular expressions
# Regular expressions (perl=T selects Perl-compatible syntax). Uses the vector
# text defined in Section 3.7.4.
grep("^t", text, ignore.case=T, perl=T, value=T) # ^ anchors the match at the start of the string

grep("s.c", text, ignore.case=T, perl=T, value=T) # . stands for any one character

grep("f...t", text, ignore.case=T, perl=T, value=T)

gsub("\\.", "!", text, perl=T) # \\. is a literal period

colors<-c("color", "colour")
grep("colou?r", colors, perl=T, value=T) # ? makes the preceding "u" optional

# The string contains runs of several spaces; " +" matches one or more spaces.
some.text.line<-"This  is    just one    example."
gsub(" +", " ", some.text.line, perl=T)

grep("colou{0,1}r", colors, perl=T, value=T) # {0,1}: zero or one occurrence, same as ?

# Quantifiers after a parenthesized group apply to the whole group.
some.vector<-c("a st b", "a stst b", "a st st b")
grep("(st){2,3}", some.vector, perl=T, value=T)

grep("(.t){2,3}", some.vector, perl=T, value=T)

gsub("a (first|second)", "another", text, ignore.case=T, perl=T) # | separates alternatives

gsub("(a|e|i|o|u)", "V", text, ignore.case=T, perl=T)

gsub("[aeiou]", "V", text, ignore.case=T, perl=T) # character class: any one of the listed characters

gsub("[a-ht-z]", "X", text, ignore.case=T, perl=T) # ranges inside a character class

gsub("[1-5]", "3", "0123456789", perl=T)

#######################################################################
# You should now do "Exercise Box 3.6: a few regular expressions" ... #
#######################################################################

# Negated classes, predefined classes, greedy vs. lazy matching and
# back-references. Uses the vector text defined in Section 3.7.4.
gsub("[^1-5]", "3", "0123456789", perl=T) # ^ inside [] negates the class

gsub("\\d", "3", "a1b2c3d4e5", perl=T) # \\d: digits

gsub("\\D", "3", "a1b2c3d4e5", perl=T) # \\D: everything but digits

gsub("\\w+", "WRD", text, perl=T) # \\w: word characters

# Three ways of addressing word boundaries: a word character followed by a
# non-word character, the reverse, and the zero-width boundary \\b.
gsub("\\w\\W", "<WB>", text, perl=T)
gsub("\\W\\w", "<WB>", text, perl=T)
gsub("\\b", "<WB>", text, perl=T)

# .* is greedy: it matches as much as possible.
(the.match<-gregexpr("s.*s", text[1], perl=T))

# Extract the matched string from its start position and match length.
substr(text[1], unlist(the.match), unlist(the.match)+attr(the.match[[1]], "match.length")-1)

some.corpus.line<-"he said, you are lazy and you are stupid."
gregexpr("you.*are", some.corpus.line, perl=T)

# Two ways of making the match lazy (as short as possible): the (?U) modifier
# for the whole expression, or a ? after the quantifier.
gregexpr("(?U)you.*are", some.corpus.line, perl=T)

gregexpr("you.*?are", some.corpus.line, perl=T)

gregexpr("s.*?s", text[1], perl=T)

(text.2<-gsub("\\w+", "<w>", text, perl=T))

(text.2<-gsub("\\b", "<w>", text, perl=T))

# \\1 in the replacement inserts what the first parenthesized group matched.
(text.2<-gsub("(\\w+)", "\\1<w>", text, perl=T))

(text.3<-gsub("([!:,\\.\\?])", "\\1<p>", text.2, perl=T))

American.dates<-c("5/15/1976", "2.15.1970", "1.9.2006")

# Swap the first two numbers; the separators are replaced by "/".
(British.dates<-sub("(\\d{1,2})\\D(\\d{1,2})\\D", "\\2/\\1/", American.dates, perl=T))

# Same swap, but the original separators are captured and kept.
(British.dates<-sub("(\\d{1,2})(\\D)(\\d{1,2})(\\D)", "\\3\\4\\1\\2", American.dates, perl=T))

# \\1 inside the pattern requires a repetition of what group 1 matched.
gsub("(\\w+?)(\\W+\\w*?)\\1(\\W)", "\\1<r>\\2\\1<r>\\3", text, perl=T)

gsub("(\\w+?)(\\W+\\w*?)\\1(\\W)", "\\1<r>\\2\\1<r>\\3", "This not is my dog", perl=T)

gsub("(\\w+?)(\\W+\\w*?)\\1\\b", "\\1<r>\\2\\1<r>", text, perl=T)

# Lookahead, tag removal and strapply(). Uses text from Section 3.7.4 until it
# is redefined below; strapply() needs the gsubfn package to be loaded.
example<-c("abcd", "abcde", "abcdf")

# (?=de) is a positive lookahead: "de" must follow but is not part of the match.
grep("abc(?=de)", example, perl=T)

grep("abcde", example, perl=T)

# The difference shows in replacements: only the matched part is replaced.
gsub("abcde", "xyz", example, perl=T)

gsub("abc(?=de)", "xyz", example, perl=T)

gsub("(.)(?=e)", "\\1\\1", example, perl=T)

text

gregexpr("s.*?(?=s)", text[1], perl=T)

# (?!e) is a negative lookahead: "d" not followed by "e". Unlike (?!e), the
# class [^e] in the next call needs a character after the "d" and replaces it too.
gsub("d(?!e)", "D", example, perl=T)

gsub("d[^e]", "D", example, perl=T)

# example1 to example3 are identical except for the second-to-last tag
# ("<ptr ...>", "<wtr ...>", "<p tr ...>").
example1<-"<w UNC>er<c PUN>, <w AV0>anyway <w PNP>we<w VBB>'re <w AJ0>alright <w AV0>now <ptr target=KB0LC003><w AV0>so<c PUN>, <w PNP>you <w VVB>know <ptr target=KB0LC004></u>"

# First attempt: delete tags whose first character is not w or c and whose
# second character is not a space.
gsub("<[^wc][^ ].*?>", "", example1, perl=T)

example2<-"<w UNC>er<c PUN>, <w AV0>anyway <w PNP>we<w VBB>'re <w AJ0>alright <w AV0>now <ptr target=KB0LC003><w AV0>so<c PUN>, <w PNP>you <w VVB>know <wtr target=KB0LC004></u>"
gsub("<[^wc][^ ].*?>", "", example2, perl=T)

example3<-"<w UNC>er<c PUN>, <w AV0>anyway <w PNP>we<w VBB>'re <w AJ0>alright <w AV0>now <ptr target=KB0LC003><w AV0>so<c PUN>, <w PNP>you <w VVB>know <p tr target=KB0LC004></u>"
gsub("<[^wc][^ ].*?>", "", example3, perl=T)

# Second attempt with a negative lookahead: delete every tag that does not have
# the form "<w XXX>" / "<c XXX>" (optionally "XXX-XXX").
gsub("<(?![wc] ...(-...)?>).*?>", "", example1, perl=T)

# Same, but also delete the material that follows such a tag up to the next "<".
gsub("<(?![wc] ...(-...)?>).*?>[^<]*", "", example1, perl=T)

text<-c("This is a first example sentence.", "And this is a second example sentence.")

strapply(text[1], "first")

strapply(text, "[a-z]*i[a-z]*", ignore.case=T, perl=T)

# A function given after the pattern (here nchar, below c) is applied to the matches.
strapply(text, "[a-z]*i[a-z]*", nchar, ignore.case=T, perl=T)

strapply(text, "([a-z]*)i([a-z]*)", c, ignore.case=T, perl=T)

# backref controls what is handed to the function (whole match and/or
# parenthesized groups) - see the gsubfn documentation for the exact meaning
# of negative and positive values.
strapply(text, "([a-z]*)i([a-z]*)", c, ignore.case=T, perl=T, backref=-1)

strapply(text, "([a-z]*)i([a-z]*)", c, ignore.case=T, perl=T, backref=1)

############################################################################
# You should now do "Exercise Box 3.7: a few more regular expressions" ... #
############################################################################



## Section 3.7.6: Merging and splitting (vectors of) character strings with regular expressions
# Splitting a tagged string at its tags with regular expressions. The tag
# pattern is widened step by step as the example string gets more varied.
tagtext<-"<w DPS>my <w NN1>mum <w CJC>and <w DPS>my <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"

unlist(strsplit(tagtext, "<w ...>", perl=T)) # tags of the form "<w " + three characters + ">"

# Now the string also contains a "<c ...>" tag.
tagtext<-"<w DPS>my <w NN1>mum <c PUN>, <w DPS>my <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"

unlist(strsplit(tagtext, "<[wc] ...>", perl=T))

# Now the string also contains a tag with two hyphenated codes ("AJ0-VVN").
tagtext<-"<w DPS>my <w NN1>mum <c PUN>, <w DPS>my <w AJ0-VVN>beloved <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"

unlist(strsplit(tagtext, "<[wc] (...|...-...)>", perl=T))

unlist(strsplit(tagtext, "<[wc] .{3,7}>", perl=T)) # shorter: three to seven characters

tagtext<-"<w DPS>my <w NN1>service <w DPS>my <w AJ0-VVN>beloved <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"

unlist(strsplit(tagtext, "<[wc] .{3,7}>", perl=T))

# In the frequency table "service " (followed by a space) and "service" count
# as different strings ...
table(unlist(strsplit(tagtext, "<[wc] .{3,7}>", perl=T)))

# ... so the spaces around the tags are made part of the split pattern.
unlist(strsplit(tagtext, " *<[wc] .{3,7}> *", perl=T))

# The opposite approach: extract every "<" together with everything up to the next "<".
unlist(strapply(tagtext, "<[^<]*", perl=T))



### Section 3.8: File and directory operations
# Working directory and file information. The paths are hard-coded for the
# book's C:/_qclwr directory layout.
getwd() # show the current working directory

setwd("C:/_qclwr") # if you don't get a response, it's done

dir("C:/_qclwr/BNCwe", pattern="^D") # only entries whose name starts with "D"

basename("C:/_qclwr/test.txt") # the file name without the directory part

file.info(dir()) # size, dates etc. of everything in the working directory

# to plot into a file
# png() opens a file as graphics device ("file" partially matches its filename
# argument), plot() draws into it, and dev.off() closes the device, which
# completes the file.
png(file="C:/_qclwr/_outputfiles/03-8_graph.png", width=600, height=600)
   plot(1:10, 1:10, type="b")
dev.off()


# (wiki page navigation footer, not part of the script) トップ   新規 一覧 単語検索 最終更新   ヘルプ   最終更新のRSS