ScriptChap3 のバックアップ(No.1) - 投野由紀夫授業用ホームページ

バックアップ一覧
ソースを表示
ScriptChap3 は削除されています。
- 1 (2009-10-28 (水) 21:27:33)

第３章のスクリプト

#### Chapter 3: A brief introduction to R

# Note: The lines are not preceded by "> " here since

# (i) the contrast between what you enter and what R outputs does not have to be highlighted - below you will only find lines you will enter, and

# (ii) I would like you to be able to copy and paste directly without having to delete line-initial "> " all the time.

# Warm-up: two quick calculations to check that the session responds,
# then load the 'corpora' package.
mean(x = c(1, 2, 3))

2 - 3

library("corpora")

### Section 3.1: A few central notions: data structures, functions, and arguments
# Each statement is on its own line so that it can be pasted into the
# console one at a time.

# simple arithmetic entered directly at the prompt
2+2
3^2
(2+3)^2

# calling functions, with positional and with named arguments
sqrt(5)
log(150, 10)
log(x=150, base=10) # with argument labels
Log(150, 10) # R does not know this function - it only knows "log"

# storing, listing, and deleting data structures
aa<-sqrt(5) # compute the square root of 5
ls()
(aa<-sqrt(5)) # the surrounding parentheses also print the assigned value
(aa<-aa+2)
sqrt(9); sqrt(16)
rm(aa) # delete aa
rm(list=ls(all=TRUE)) # delete all data structures

# the same call to sample with fewer and fewer explicitly stated arguments
x<-c(1:10)
sample(x, size=5, replace=T, prob=NULL)
sample(x, 5, T, NULL)
sample(x, 5, T)
sample(x, 5, F)
sample(x, 5)
x
sample(x)
sample(10)

# quit R
q()

### Section 3.2: Vectors

## Section 3.2.1: Basics
# A single number is already a vector (of length 1).
sqrt(5)
aa<-sqrt(5) # compute the square root of 5
is.vector(aa)
class(aa)
length(aa)

(empty<-vector(length=3)) # create an 'empty' vector of a user-defined length

# a single character string is a vector of length 1, too
(a.name<-"James")
class(a.name)
length(a.name)

# vectors with several elements are created with c
(numbers<-c(1, 2, 3))
(names<-c("James", "Jonathan", "Jean-Luc"))

numbers1<-c(1, 2, 3); numbers2<-c(4, 5, 6) # generate two vectors
(numbers1.and.numbers2<-c(numbers1, numbers2)) # join the two vectors
(numbers1.and.numbers2<-append(numbers1, numbers2)) # another way to join the two vectors

numbers1+numbers2 # that is, 1+4, 2+5, 3+6

# a vector of length 1 is applied to every element of the longer vector
bb<-10
numbers1*bb

# a vector of length 2 is recycled against the three elements of numbers1
bb<-c(10, 20)
numbers1*bb

names(numbers)<-c("first", "second", "third"); numbers

# mixing numbers and character strings yields a character vector
(mixture<-c(1, 2, "Benjamin"))

str(numbers1)
str(mixture)

## Section 3.2.2: Loading vectors
# read a file with one element per line
x<-scan(file="C:/_qclwr/_inputfiles/dat_vector-a.txt", sep="\n")

# read the same file of character strings with two different separator settings
x.1<-scan(file="C:/_qclwr/_inputfiles/dat_vector-b.txt", what="char")
x.2<-scan(file="C:/_qclwr/_inputfiles/dat_vector-b.txt", what="char", sep="\n")
x.1
x.2

# enter a directory path at the prompt, then pick file(s) from that directory
filename<-select.list(dir(scan(nmax=1, what="char")), multiple=T)

# choose the input file interactively instead of typing its path
x.1<-scan(file.choose(), what="char")
x.2<-scan(file.choose(), what="char", sep="\n")

# enter the elements at the console: the three lines after the call are the
# input typed at scan's prompt; an empty line ends the input
x<-scan()
1
2
3

## Section 3.2.3: Accessing and processing (parts of) vectors
min(c(1, 2, 3)); max(c(1, 2, 3))

# subsetting with positions
x<-c("a", "b", "c", "d", "e")
x[3] # access the 3rd element of x

y<-3
x[y] # access the 3rd element of x

z<-c(1, 3)
x[z] # access the 1st and the 3rd element of x just as x[c(1, 3)] would do

z<-c(1:3)
x[z] # access the elements 1 to 3 of x

x[-2] # access x but without its 2nd element

# logical tests return one TRUE/FALSE per element
x=="d"

(x<-c(10:1)) # generate and output a vector with the numbers from 10 to 1
x==4 # which elements of x are 4?
x<=7 # which elements of x are smaller than or equal to 7?
x!=8 # which elements of x are not 8?
(x>8 | x<3) # which elements of x are larger than 8 or smaller than 3?

# which returns the positions of the elements for which the test is TRUE
which(x==4) # which elements of x are 4?
which(x<=7) # which elements of x are less than or equal to 7?
which(x!=8) # which elements of x are not 8?
which(x>8 | x<3) # which elements of x are greater than 8 or less than 3?

(pointer<-which(x>8 | x<3))
(y<-x[pointer])

x[which(x>8 | x<3)] # output the elements of x which are greater than 8 or smaller than 3
x[x>8 | x<3] # output the elements of x which are greater than 8 or smaller than 3
length(which(x>8 | x<3)) # output the number of elements of x which are greater than 8 or smaller than 3
sum(x>8 | x<3) # output the number of elements of x which are greater than 8 or smaller than 3

# three equivalent ways of replacing elements that meet a condition
x # output x
y<-which(x>8) # store the positions of elements greater than 8 in the vector y
x[y]<-12; x # replace the elements of x that are greater than 8 by 12

x<-c(10:1) # generate a vector with the numbers from 10 to 1 again
x[which(x>8)]<-12; x # change the element(s) in x which are greater than 8 to 12

x<-c(10:1) # generate a vector with the numbers from 10 to 1 again
x[x>8]<-12; x

# set-like operations on two vectors
x<-c(10:1); y<-c(2, 5, 9) # generate vectors again

x %in% y
y %in% x
x[x %in% y]
match(x, y)
match(y, x)
setdiff(x, y)
setdiff(y, x)
intersect(x, y)
intersect(y, x)
union(x, y)
union(y, x)

# unique values and frequency tables
g<-c(1, 2, 3, 2, 3, 4, 3, 4, 5)
h<-c(2, 3, 1, 5, 2, 6, 3, 1, 2)
unique(g)
table(g)
table(g, h)

# sorting: sort returns the sorted values, order the positions that sort them
h
sort(h, decreasing=T)

z<-c(3, 5, 10, 1, 6, 7, 8, 2, 4, 9)
order(z, decreasing=F)

##############################################################

# You should now do "Exercise Box 3.1: Handling vectors" ... #

##############################################################

### Section 3.3: Factors
# Turn a character vector into a factor and check the result.
f<-c("open", "open", "open", "closed", "closed")
f
(f<-factor(f))
is.factor(f)

### Section 3.4: Data frames

## Section 3.4.1: Generating data frames in R
rm(list=ls(all=T)) # start from an empty workspace

# one vector per column; all four vectors have five elements
PartOfSpeech<-c("ADJ", "ADV", "N", "CONJ", "PREP")
TokenFrequency<-c(421, 337, 1411, 458, 455)
TypeFrequency<-c(271, 103, 735, 18, 37)
Class<-c("open", "open", "open", "closed", "closed")

# data frame with R's default row numbering
x<-data.frame(PartOfSpeech, TokenFrequency, TypeFrequency, Class)
x
str(x)
x$PartOfSpeech

# data frame that uses the parts of speech as row names instead of as a column
(x.2<-data.frame(TokenFrequency, TypeFrequency, Class, row.names=PartOfSpeech))
str(x.2)

## Section 3.4.2: Loading and saving data frames in R
rm(list=ls(all=T)) # start from an empty workspace

# load a tab-separated file with a header row; the file is picked in a dialog
x<-read.table(choose.files(default="C:/_qclwr/_inputfiles/dat_dataframe-a.txt"), header=T, sep="\t", comment.char="") # no row.names: R numbers rows

x.2<-read.table(choose.files(default="C:/_qclwr/_inputfiles/dat_dataframe-a.txt"), header=T, row.names=1, sep="\t", comment.char="") # with row.names

# save both data frames as tab-separated files without quotes
write.table(x, choose.files(default="C:/_qclwr/_outputfiles/03-4-2_dataframe-a.txt"), quote=F, sep="\t", row.names=F)

write.table(x.2, choose.files(default="C:/_qclwr/_outputfiles/03-4-2_dataframe-b.txt"), quote=F, sep="\t", col.names=NA)

## Section 3.4.3: Accessing and processing (parts of) data frames in R
x<-read.table(choose.files(default="C:/_qclwr/_inputfiles/dat_dataframe-a.txt"), header=T, sep="\t", comment.char="")

# columns can be accessed with the $ sign
str(x)
x$TokenFrequency
x$Class

# after attach, the columns can be accessed by their names alone
attach(x)
Class

(TokenFrequency[4]<-20)
TokenFrequency[4]<-458 # change the value back to the old one

# access with [row, column]
x[2,3] # the value of the second row and the third column
x[2,] # all values of the second row (because no column is specified)
x[,3] # all values of the third column (because no row is specified)
x[2:3,4] # two values of the fourth column
x[c(1,3), c(2,4)] # the 1st and 3rd row of the 2nd and 4th column

# access with conditions
which(x[,2]>450)
x[,3][which(x[,3]>100)]
x[,3][x[,3]>100]
TypeFrequency[TypeFrequency>100]

(y<-x[which(Class=="open"),]) # or shorter: (y<-x[Class=="open",])
(y<-x[which(x[,4]=="open"),]) # or shorter: (y<-x[x[,4]=="open",])
(y<-subset(x, Class=="open"))
(y<-subset(x, Class=="open" & TokenFrequency<1000))
(y<-subset(x, PartOfSpeech %in% c("ADJ", "ADV")))

# sort the rows: by Class, and within Class by descending TokenFrequency
(ordering.index<-order(Class, -TokenFrequency))
x[ordering.index,]
x[order(Class, -TokenFrequency),]

# put the rows into a random order
no.of.rows<-dim(x)[1] # for the columns: no.of.columns<-dim(x)[2]
ordering.index<-sample(no.of.rows); ordering.index
x[ordering.index,]
x[sample(dim(x)[1]),]

# sort in descending order by two non-numeric columns
ordering.index<-order(-rank(Class), -rank(PartOfSpeech))
x[ordering.index,]

##################################################################

# You should now do "Exercise Box 3.2: Handling data frames" ... #

##################################################################

### Section 3.5: Lists
rm(list=ls(all=T)) # start from an empty workspace

# three data structures of different kinds that go into one list
a.vector<-c(1:10) # generates a vector with the numbers from one to ten
a.dataframe<-read.table(choose.files(default="C:/_qclwr/_inputfiles/dat_dataframe-a.txt"), header=T, sep="\t", comment.char="") # load the data frame from section 3.4
another.vector<-c("This", "may", "be", "a", "sentence", "from", "a", "corpus","file", ".")
(a.list<-list(a.vector, a.dataframe, another.vector))
str(a.list)

# two ways of giving names to the parts of a list
a.list<-list(Part1=a.vector, Part2=a.dataframe, Part3=another.vector)
names(a.list)<-c("Part1", "Part2", "Part3")

a.list<-list(a.vector, a.dataframe, another.vector) # redefine a.list

# double square brackets return the element itself ...
a.list[[1]]
a.list[[2]]
a.list[[3]]

is.list(a.list[[1]])
is.vector(a.list[[1]])
is.data.frame(a.list[[2]])
is.vector(a.list[[3]])

# ... whereas single square brackets return a list
a.list[1]
is.list(a.list[1])

a.list[[1]]
a.list[1]

# NOTE(review): a.list was redefined without names above, so this returns NULL
# unless the names are assigned again first - confirm the intended order
a.list$Part1 # or a.list[["Part1"]]

a.list[c(1,3)]
a.list[[c(1,3)]] # take the third part of the first element of the list
a.list[[1]][3] # take the third part of the first element of the list

a.list[[1]][3:5] # consecutive parts
a.list[[1]][c(3, 5)] # note that "a.list[[1]][3, 5]" does not work

a.list[[2]][3,2]
a.list[[2]][3,2:4]
a.list[[2]][3,c(2, 4)]

# split a data frame into a list of data frames
# NOTE(review): Class and PartOfSpeech are not created in this section;
# presumably they are still reachable through the attach(x) call of
# Section 3.4.3 - confirm
x<-a.list[[2]]
(y<-split(x, Class))
y$open
split(x, list(Class, PartOfSpeech))

### Section 3.6: Elementary programming functions

## Section 3.6.1: Conditional expressions
# Print one of two messages depending on the value of a.
a<-2 # you can of course insert another number here
if (a>2) {
  cat("a is greater than 2.\n")
} else {
  cat("a is not greater than 2.\n")
}

# this is an alternative: note the use of else if, which allows you to get rid of one additional level of embedding
b<-"car" # you can of course insert another word here
if (b=="automobile") {
  cat("b is 'automobile'!\n")
} else if (b=="car") { # !
  cat("b is 'car'!\n")
} else {
  cat("b is neither 'automobile' nor 'car'.\n")
}

#####################################################################

# You should now do "Exercise Box 3.3: Conditional expressions" ... #

#####################################################################

## Section 3.6.2: Loops
# for-loop over the integers 1 to 3
for (i in 1:3) {
  cat(i, "\n")
}

# for-loop over the elements of an arbitrary vector
j<-seq(2, 1, -0.5)
for (i in j) {
  cat(i, "\n")
}

# nested loops
for (i in 1:2) {
  for (j in 6:7) {
    cat(i, "times", j, "is", i*j, "\n")
  }
}

# next skips the rest of the current iteration
for (i in 1:3) {
  if (i==2) {
    next
  }
  cat(i, "\n")
}

# break leaves the loop altogether
for (i in 1:5) {
  if (i==3) {
    break
  }
  cat(i, "\n")
}

# repeat runs until a break is reached
i<-1
repeat {
  cat(i, "\n")
  i<-i+1 # this way of incrementing a variable is extremely useful!
  if (i==3) { break }
}

# while runs as long as its condition is TRUE
i<-1
while (i<3) {
  cat(i, "\n")
  i<-i+1
}

###################################################

# You should now do "Exercise Box 3.4: Loops" ... #

###################################################

## Section 3.6.3: Rules of programming
# The same word-class data as in Section 3.4.1.
PartOfSpeech<-c("ADJ", "ADV", "N", "CONJ", "PREP")
TokenFrequency<-c(421, 337, 1411, 458, 455)
TypeFrequency<-c(271, 103, 735, 18, 37)
Class<-c("open", "open", "open", "closed", "closed")

# version 1: add up the token frequencies per class with a loop
sum.closed<-0; sum.open<-0 # define two vectors for the results
for (i in 1:5) {
  current.class<-Class[i] # access each word class
  if (current.class=="closed") {
    sum.closed<-sum.closed+TokenFrequency[i] # if the current class is "closed", add its token frequency to the first result vector
  } else {
    sum.open<-sum.open+TokenFrequency[i] # if the current class is not "closed", add its token frequency to the other result vector
  } # end of if: test current class
} # end of for: access each word class
sum.closed; sum.open # look at the output

# version 2: the same sums without a loop
sum(TokenFrequency[which(Class=="closed")])
sum(TokenFrequency[which(Class=="open")])

# version 3: both sums in a single call
tapply(TokenFrequency, Class, sum)

####################################################

# You should now do "Exercise Box 3.5: tapply" ... #

####################################################

# A list of four vectors of increasing length.
another.list<-list(c(1), c(2, 3), c(4, 5, 6), c(7, 8, 9, 10))

# the length of each list element: first with a loop ...
lengths<-vector()
for (i in 1:length(another.list)) {
  lengths[i]<-length(another.list[[i]])
}
lengths

# ... then with sapply
sapply(another.list, length)

# the first element of each list element: first with a loop ...
first.elements<-vector()
for (i in 1:length(another.list)) {
  first.elements[i]<-another.list[[i]][1]
}
first.elements

# ... then with sapply, passing the subsetting function "[" and its argument 1
sapply(another.list, "[", 1)
sapply(a.list, "[", 1) # use the list we generated in Section 3.5

a<-c(1, 5, 3); b<-c(2, 6, 4); (ab<-list(a, b))
lapply(ab, sort, decreasing=F) # decreasing=F is passed on to sort

### Section 3.7: Character/string processing

## Section 3.7.1: Getting information from and accessing (vectors of) character strings
example<-c("I", "do", "not", "know")
nchar(example) # number of characters of each element

# substr takes the string(s), the start position(s), and the end position(s)
substr("internationalization", 6, 13)
substr(example, 2, 3)

some.first.vector<-c("abcd", "efgh")
some.other.vector<-c("ijkl", "mnop")
substr(c(some.first.vector, some.other.vector), c(1, 2, 3, 4), c(2, 3, 4, 4))

## Section 3.7.2: Elementary ways to change (vectors of) character strings
tolower(example)
toupper(example)
chartr("o", "x", example)

## Section 3.7.3: Merging and splitting (vectors of) character strings without regular expressions
paste("I", "do", "not", "know", sep=" ")
paste("I", "do", "not", "know", collapse=" ") # same result
paste("I", "do", "not", "know", sep=" ", collapse=" ") # same result

paste(example, sep=" ")
# with a longer vector, there is a difference
paste(example, sep=" ", collapse=" ")
# but sep=" " is the default of paste and can be omitted
paste(example, collapse=" ")

# strsplit returns a list with one vector per input string
example.2<-"I do not know"
strsplit(example.2, " ")
strsplit(example.2, "")

example.3<-c("This is the first character string", "This is the second character string")
strsplit(example.3, " ")
unlist(strsplit(example.3, " "))

## Section 3.7.4: Searching and replacing without regular expressions
text<-c("This is a first example sentence.", "And this is a second example sentence.")

# grep returns the positions (or, with value=T, the elements) that contain a match
grep("second", text)
text[grep("second", text)]
grep("second", text, value=T)
grep("eco", text, value=T)
grep("is", text)

# regexpr returns the start of the first match per element (-1 if there is none)
regexpr("second", text)
attributes(regexpr("second", text)) # returns a list!
attr(regexpr("second", text), "match.length") # returns a vector!

regexpr("e", text)

# gregexpr returns the starts of all matches as a list with one part per element
gregexpr("e", text)
gregexpr("e", text)[[1]] # and of course the same with [[2]]
unlist(gregexpr("e", text)[1]) # and of course the same with [2]
sapply(gregexpr("e", text), c)
unlist(gregexpr("e", text))

attributes(gregexpr("e", text))
attributes(gregexpr("e", text)[1])
gregexpr("e", text)[[1]]
attributes(gregexpr("e", text)[[1]]) # returns a list, same with [[2]]
unlist(attributes(gregexpr("e", text)[[1]]))
attr(gregexpr("e", text)[[1]], "match.length") # returns a vector, same with [[2]]

sapply(gregexpr("e", text), attributes)
sapply(gregexpr("e", text), attr, "match.length")
unlist(sapply(gregexpr("e", text), attr, "match.length"))

# strapply comes from the package gsubfn and returns the matches themselves
library(gsubfn)
text<-c("This is a first example sentence.", "And this is a second example sentence.")
strapply(text, "first")
strapply(text, "is")

# gsub replaces all matches; the first call also replaces every "a" inside words
gsub("a", "the", text)
gsub(" a ", " the ", text)

# gsub does not change text itself; the result has to be assigned
text
(text.2<-gsub(" a ", " the ", text))

## Section 3.7.5: Searching and replacing with regular expressions
# anchors and the wildcard "."; an escaped period matches a literal period
grep("^t", text, ignore.case=T, perl=T, value=T)
grep("s.c", text, ignore.case=T, perl=T, value=T)
grep("f...t", text, ignore.case=T, perl=T, value=T)
gsub("\\.", "!", text, perl=T)

# quantifiers
colors<-c("color", "colour")
grep("colou?r", colors, perl=T, value=T)

# NOTE(review): the example string presumably contained runs of several spaces
# that the wiki rendering collapsed; with single spaces this call changes
# nothing - confirm against the printed script
some.text.line<-"This is just one example."
gsub(" +", " ", some.text.line, perl=T)

grep("colou{0,1}r", colors, perl=T, value=T)

some.vector<-c("a st b", "a stst b", "a st st b")
grep("(st){2,3}", some.vector, perl=T, value=T)
grep("(.t){2,3}", some.vector, perl=T, value=T)

# alternatives and character classes
gsub("a (first|second)", "another", text, ignore.case=T, perl=T)
gsub("(a|e|i|o|u)", "V", text, ignore.case=T, perl=T)
gsub("[aeiou]", "V", text, ignore.case=T, perl=T)
gsub("[a-ht-z]", "X", text, ignore.case=T, perl=T)
gsub("[1-5]", "3", "0123456789", perl=T)

#######################################################################

# You should now do "Exercise Box 3.6: a few regular expressions" ... #

#######################################################################

# negated character class and predefined character classes
gsub("[^1-5]", "3", "0123456789", perl=T)
gsub("\\d", "3", "a1b2c3d4e5", perl=T)
gsub("\\D", "3", "a1b2c3d4e5", perl=T)
gsub("\\w+", "WRD", text, perl=T)

# three attempts at marking word boundaries
gsub("\\w\\W", "<WB>", text, perl=T)
gsub("\\W\\w", "<WB>", text, perl=T)
gsub("\\b", "<WB>", text, perl=T)

# greedy matching: retrieve the matched string from its start and length
(the.match<-gregexpr("s.*s", text[1], perl=T))
substr(text[1], unlist(the.match), unlist(the.match)+attr(the.match[[1]], "match.length")-1)

# greedy versus ungreedy/lazy matching
some.corpus.line<-"he said, you are lazy and you are stupid."
gregexpr("you.*are", some.corpus.line, perl=T)
gregexpr("(?U)you.*are", some.corpus.line, perl=T)
gregexpr("you.*?are", some.corpus.line, perl=T)
gregexpr("s.*?s", text[1], perl=T)

# back-references: \\1 in the replacement stands for the first parenthesized match
(text.2<-gsub("\\w+", "<w>", text, perl=T))
(text.2<-gsub("\\b", "<w>", text, perl=T))
(text.2<-gsub("(\\w+)", "\\1<w>", text, perl=T))
# NOTE(review): as written, the replacement "\\1" puts back exactly what was
# matched; presumably a tag after \\1 was lost in the wiki rendering - confirm
(text.3<-gsub("([!:,\\.\\?])", "\\1", text.2, perl=T))

# swap the first two numbers of each date
American.dates<-c("5/15/1976", "2.15.1970", "1.9.2006")
(British.dates<-sub("(\\d{1,2})\\D(\\d{1,2})\\D", "\\2/\\1/", American.dates, perl=T))
(British.dates<-sub("(\\d{1,2})(\\D)(\\d{1,2})(\\D)", "\\3\\4\\1\\2", American.dates, perl=T))

# back-references within the search pattern: mark repeated character sequences
gsub("(\\w+?)(\\W+\\w*?)\\1(\\W)", "\\1<r>\\2\\1<r>\\3", text, perl=T)
gsub("(\\w+?)(\\W+\\w*?)\\1(\\W)", "\\1<r>\\2\\1<r>\\3", "This not is my dog", perl=T)
gsub("(\\w+?)(\\W+\\w*?)\\1\\b", "\\1<r>\\2\\1<r>", text, perl=T)

# lookahead: (?=...) and (?!...) test what follows without consuming it
example<-c("abcd", "abcde", "abcdf")

grep("abc(?=de)", example, perl=T)
grep("abcde", example, perl=T)
gsub("abcde", "xyz", example, perl=T)
gsub("abc(?=de)", "xyz", example, perl=T)
gsub("(.)(?=e)", "\\1\\1", example, perl=T)

text
gregexpr("s.*?(?=s)", text[1], perl=T)

# negative lookahead versus a negated character class
gsub("d(?!e)", "D", example, perl=T)
gsub("d[^e]", "D", example, perl=T)

# remove the tags that are not word (w) or punctuation (c) tags
example1<-"<w UNC>er<c PUN>, <w AV0>anyway <w PNP>we<w VBB>'re <w AJ0>alright <w AV0>now <ptr target=KB0LC003><w AV0>so<c PUN>, <w PNP>you <w VVB>know <ptr target=KB0LC004>"
gsub("<[^wc][^ ].*?>", "", example1, perl=T)

example2<-"<w UNC>er<c PUN>, <w AV0>anyway <w PNP>we<w VBB>'re <w AJ0>alright <w AV0>now <ptr target=KB0LC003><w AV0>so<c PUN>, <w PNP>you <w VVB>know <wtr target=KB0LC004>"
gsub("<[^wc][^ ].*?>", "", example2, perl=T)

example3<-"<w UNC>er<c PUN>, <w AV0>anyway <w PNP>we<w VBB>'re <w AJ0>alright <w AV0>now <ptr target=KB0LC003><w AV0>so<c PUN>, <w PNP>you <w VVB>know "
gsub("<[^wc][^ ].*?>", "", example3, perl=T)

# the same task with negative lookahead
gsub("<(?![wc] ...(-...)?>).*?>", "", example1, perl=T)
gsub("<(?![wc] ...(-...)?>).*?>[^<]*", "", example1, perl=T)

# Retrieve matches with strapply, optionally applying a function to each match.
text<-c(
  "This is a first example sentence.",
  "And this is a second example sentence."
)

# a literal match in the first sentence only
strapply(text[1], "first")

# every character sequence containing an "i": as is, then passed to nchar
strapply(text, "[a-z]*i[a-z]*", ignore.case=T, perl=T)
strapply(text, "[a-z]*i[a-z]*", nchar, ignore.case=T, perl=T)

# the same search with two parenthesized groups and three settings of backref
strapply(text, "([a-z]*)i([a-z]*)", c, ignore.case=T, perl=T)
strapply(text, "([a-z]*)i([a-z]*)", c, ignore.case=T, perl=T, backref=-1)
strapply(text, "([a-z]*)i([a-z]*)", c, ignore.case=T, perl=T, backref=1)

############################################################################

# You should now do "Exercise Box 3.7: a few more regular expressions" ... #

############################################################################

## Section 3.7.6: Merging and splitting (vectors of) character strings with regular expressions
# split at word tags with three-character tag names
tagtext<-"<w DPS>my <w NN1>mum <w CJC>and <w DPS>my <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"
unlist(strsplit(tagtext, "<w ...>", perl=T))

# split at word tags and punctuation tags
tagtext<-"<w DPS>my <w NN1>mum <c PUN>, <w DPS>my <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"
unlist(strsplit(tagtext, "<[wc] ...>", perl=T))

# also allow for tags of the form XXX-XXX
tagtext<-"<w DPS>my <w NN1>mum <c PUN>, <w DPS>my <w AJ0-VVN>beloved <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"
unlist(strsplit(tagtext, "<[wc] (...|...-...)>", perl=T))
unlist(strsplit(tagtext, "<[wc] .{3,7}>", perl=T))

# the spaces around the tags matter when the resulting words are counted
tagtext<-"<w DPS>my <w NN1>service <w DPS>my <w AJ0-VVN>beloved <w NN1>aunt <w VVD>went <w PRP>into <w NN1>service"
unlist(strsplit(tagtext, "<[wc] .{3,7}>", perl=T))
table(unlist(strsplit(tagtext, "<[wc] .{3,7}>", perl=T)))
unlist(strsplit(tagtext, " *<[wc] .{3,7}> *", perl=T))

# retrieve each tag together with the material that follows it
unlist(strapply(tagtext, "<[^<]*", perl=T))

### Section 3.8: File and directory operations
getwd() # output the current working directory
setwd("C:/_qclwr") # if you don't get a response, it's done

dir("C:/_qclwr/BNCwe", pattern="^D") # only the files whose names begin with "D"
basename("C:/_qclwr/test.txt") # the file name without the path
file.info(dir()) # information on all files in the working directory

# to plot into a file
png(file="C:/_qclwr/_outputfiles/03-8_graph.png", width=600, height=600)
plot(1:10, 1:10, type="b")
dev.off() # close the graphics device so that the file is written