1. Transforming variables

R:

    setwd("c:/myRfolder") 
    load(file = "mydata.RData")

a. Transformation in the middle of another function

    summary(log(mydata$q4))

b. Creating meanQ with dollar notation

    mydata$meanQ <- (mydata$q1 + mydata$q2 + mydata$q3 + mydata$q4) / 4

c. Creating two variables using transform

    mydata <- transform(mydata, score1 = (q1 + q2)/2, score2 = (q3 + q4)/2)

d. Creating meanQ using index notation on the left

    load(file = "mydata.RData")
    mydata <- data.frame(cbind(mydata, meanQ = 0.))
    mydata[7] <- (mydata$q1 + mydata$q2 + mydata$q3 + mydata$q4) / 4

PYTHON: page 706
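A minimal pandas sketch of the same transformations, assuming a DataFrame `mydata` with columns q1-q4 like the R example (the data below are hypothetical, not from page 706):

    import numpy as np
    import pandas as pd

    # Hypothetical data standing in for mydata's q1-q4 columns
    mydata = pd.DataFrame({"q1": [1, 2, 3], "q2": [2, 3, 4],
                           "q3": [3, 4, 5], "q4": [4, 5, 5]})

    # a. Transformation in the middle of another call
    np.log(mydata["q4"]).describe()

    # b. Creating meanQ directly
    mydata["meanQ"] = (mydata["q1"] + mydata["q2"] +
                       mydata["q3"] + mydata["q4"]) / 4

    # c. Creating two variables at once (analogous to transform)
    mydata = mydata.assign(score1=(mydata["q1"] + mydata["q2"]) / 2,
                           score2=(mydata["q3"] + mydata["q4"]) / 2)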

SAS:

    LIBNAME mylib 'C:\myRfolder';
    data mylib.mydataTransformed;
      set mylib.mydata;
      totalq = (q1 + q2 + q3 + q4);
      logtot = log10(totalq);
      mean1 = (q1 + q2 + q3 + q4) / 4;
      mean2 = mean(of q1-q4);
    run;

2. Procedures or functions

R:

a. Mean of the q variables

 `mean(mydata[3:6], na.rm = TRUE)`

b. Create mymatrix

 `mymatrix <- as.matrix(mydata[ , 3:6])`

c. Get mean of whole matrix

 `mean(mymatrix, na.rm = TRUE)`

d. Get mean of matrix columns

 `apply(mymatrix, 2, mean, na.rm = TRUE)`

e. Get mean of matrix rows

     apply(mymatrix, 1, mean, na.rm = TRUE)
     rowMeans(mymatrix, na.rm = TRUE)

f. Add row means to mydata

     mydata$meanQ <- apply(mymatrix, 1, mean, na.rm = TRUE)
     mydata$meanQ <- rowMeans(mymatrix, na.rm = TRUE)
     mydata <- transform(mydata, meanQ = rowMeans(mymatrix, na.rm = TRUE))

g. Means of data frames & their vectors

     lapply(mydata[, 3:6], mean, na.rm = TRUE)
     sapply(mydata[, 3:6], mean, na.rm = TRUE)
     mean(
       sapply(mydata[, 3:6], mean, na.rm = TRUE)
     )

h. Length of data frames & their vectors

    length(mydata[, "q3"])
    nrow(mydata)
    is.na(mydata[, "q3"])
    !is.na(mydata[, "q3"])
    sum(!is.na(mydata[, "q3"]))

PYTHON:
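A rough pandas equivalent, assuming `mydata` is a DataFrame whose q columns are numeric (the data below are hypothetical):

    import pandas as pd

    # Hypothetical DataFrame standing in for mydata's q1-q4 columns
    mydata = pd.DataFrame({"q1": [1, 2, None], "q2": [2, 3, 4],
                           "q3": [3, 4, 5], "q4": [4, 5, 5]})

    mydata.mean()                          # column means; NaN skipped by default
    mydata.mean(axis=1)                    # row means
    mydata.mean().mean()                   # grand mean of the column means
    mydata["meanQ"] = mydata.mean(axis=1)  # add row means to mydata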

SAS:

    data mylib.mydata;
      set mylib.mydata;
      myMean = MEAN(OF q1-q4);
      myN = N(OF q1-q4);
    run;

    proc means;
      var q1-q4 myMean myN;
    run;

3. Finding N or NVALID

R:

library("prettyR")

sapply(mydata, valid.n)

apply(myMatrix, 1, valid.n)

mydata$myQn <- apply(myMatrix,1, valid.n)
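PYTHON:

A minimal pandas counterpart, assuming a DataFrame `mydata`: count() returns the number of non-missing values, analogous to valid.n (the data below are hypothetical).

    import pandas as pd

    # Hypothetical DataFrame with some missing values
    mydata = pd.DataFrame({"q1": [1, None, 3], "q2": [2, 3, None]})

    mydata.count()                        # valid N per column
    mydata.count(axis=1)                  # valid N per row
    mydata["myQn"] = mydata.count(axis=1)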

4. Standardizing and ranking variables

R:

    myZs <- apply(mymatrix, 2, scale)

    myRanks <- apply(mymatrix, 2, rank)

PYTHON:

The rank method produces a data ranking, with ties being assigned the mean of the ranks (by default) within the group.
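A minimal pandas sketch of both steps, assuming `df` is a numeric DataFrame standing in for mymatrix (the data below are hypothetical):

    import pandas as pd

    # Hypothetical numeric data standing in for mymatrix
    df = pd.DataFrame({"q1": [1, 2, 3, 4], "q2": [5, 3, 4, 1]})

    myZs = (df - df.mean()) / df.std()  # column-wise z scores
    myRanks = df.rank()                 # ranks; ties get the mean rank by default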

SAS:

     proc standard data = mylib.mydata
         mean = 0 std = 1 out = myzs;
     run;

     proc rank data = mylib.mydata out = myranks;
     run;

5. Applying your own functions

    apply(mymatrix, 2, mean, sd)   # No good: the extra sd is just passed as an argument to mean

    mystats <- function(x) {
      c(mean = mean(x, na.rm = TRUE),
        sd = sd(x, na.rm = TRUE))
    }
    apply(mymatrix, 2, mystats)

    apply(mymatrix, 2, function(x) {
      c(mean = mean(x, na.rm = TRUE),
        sd = sd(x, na.rm = TRUE))
    })
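For comparison, a minimal pandas sketch of applying several summary functions per column at once, assuming a hypothetical DataFrame `df` in place of mymatrix:

    import pandas as pd

    # Hypothetical DataFrame standing in for mymatrix
    df = pd.DataFrame({"q1": [1, 2, 3, 4], "q2": [5, 3, 4, 1]})

    # Built-in summaries can be requested by name
    df.agg(["mean", "std"])

    # Or roll your own function, as in the R mystats example
    df.apply(lambda x: pd.Series({"mean": x.mean(), "sd": x.std()}))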

6. Conditional transformations

R:

setwd("c:/myRfolder")

load(file = "mydata.RData")

mydata$q4Sagree <- ifelse(q4 ==5, 1,0)

mydata$q4agree <- as.numeric(q4 ==5)

mydata$q4agree <- ifelse(q4 >= 4, 1,0)

mydata$ws1agree <- ifelse(workshop = 1& q4 >= 4, 1,0)

mydata$score <- ifelse(gender =="f", (2*q1) +q2, (3*q1) +q2)

PYTHON:
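A minimal pandas sketch of the same conditional transformations, assuming a DataFrame with the variables used above (the data are hypothetical):

    import numpy as np
    import pandas as pd

    # Hypothetical data with the variables used in the R example
    mydata = pd.DataFrame({"workshop": [1, 2, 1],
                           "gender": ["f", "m", "f"],
                           "q1": [3, 4, 5], "q2": [4, 5, 3], "q4": [5, 4, 2]})

    mydata["q4Sagree"] = (mydata["q4"] == 5).astype(int)
    mydata["q4agree"] = (mydata["q4"] >= 4).astype(int)
    mydata["ws1agree"] = ((mydata["workshop"] == 1) &
                          (mydata["q4"] >= 4)).astype(int)
    mydata["score"] = np.where(mydata["gender"] == "f",
                               2 * mydata["q1"] + mydata["q2"],
                               3 * mydata["q1"] + mydata["q2"])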

SAS:

      LIBNAME mylib 'C:\myRfolder';
      data mylib.mydataTransformed;
        set mylib.mydata;
        if q4 = 5 then x1 = 1; else x1 = 0;
        if q4 >= 4 then x2 = 1; else x2 = 0;
        if workshop = 1 & q4 >= 5 then x3 = 1;
          else x3 = 0;
        if gender = "f" then scoreA = 2*q1 + q2;
                        else scoreA = 3*q1 + q2;
        if workshop = 1 and q4 >= 5
          then scoreB = 2*q1 + q2;
          else scoreB = 3*q1 + q2;
      run;

7. Cutting functions

R:

attach(mydata100)

a. An inefficient approach

     postgroup <- posttest
     postgroup <- ifelse(posttest < 60,                  1, postgroup)
     postgroup <- ifelse(posttest >= 60 & posttest < 70, 2, postgroup)
     postgroup <- ifelse(posttest >= 70 & posttest < 80, 3, postgroup)
     postgroup <- ifelse(posttest >= 80 & posttest < 90, 4, postgroup)
     postgroup <- ifelse(posttest >= 90,                 5, postgroup)

b. An efficient approach

    postgroup <-
      ifelse(posttest < 60,                  1,
      ifelse(posttest >= 60 & posttest < 70, 2,
      ifelse(posttest >= 70 & posttest < 80, 3,
      ifelse(posttest >= 80 & posttest < 90, 4,
      ifelse(posttest >= 90,                 5, posttest)))))
    table(postgroup)

c. Logical approach

    postgroup <- 1 +
      (posttest >= 60) +
      (posttest >= 70) +
      (posttest >= 80) +
      (posttest >= 90)
    table(postgroup)

    library("Hmisc")
    postgroup <- cut2(posttest, c(60, 70, 80, 90))
    table(postgroup)
    postgroup <- cut2(posttest, g = 5)
    postgroup <- cut2(posttest, m = 25)

PYTHON:
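A minimal pandas counterpart, assuming a hypothetical posttest Series: pd.cut bins at explicit break points, and pd.qcut makes groups of roughly equal size, much like cut2 with g =.

    import pandas as pd

    # Hypothetical posttest scores
    posttest = pd.Series([55, 62, 71, 84, 93, 78, 66])

    postgroup = pd.cut(posttest, bins=[0, 60, 70, 80, 90, 200],
                       labels=[1, 2, 3, 4, 5], right=False)
    postgroup.value_counts()   # analogous to table(postgroup)

    pd.qcut(posttest, q=5)     # five groups of roughly equal size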

SAS:

    data mylib.mydataTransformed;
    set mylib.mydata100;
    if (posttest < 60) then postGroup = 1;
    else if (posttest >= 60 & posttest < 70) then postGroup = 2;
    else if (posttest >= 70 & posttest < 80) then postGroup = 3;
    else if (posttest >= 80 & posttest < 90) then postGroup = 4;
    else if (posttest >= 90) then postGroup = 5;
    run;

    proc freq;
      tables postgroup;
    run;

    proc rank out = mylib.mydataTransformed Groups = 5;
      var posttest;
    run;

8. Multiple conditional transformations

R:

a. Using the ifelse approach

    mydata$score1 <- ifelse(gender == "f",
      (2 * q1) + q2,    # score1 for females
      (20 * q1) + q2)   # score1 for males

    mydata$score2 <- ifelse(gender == "f",
      (3 * q1) + q2,    # score2 for females
      (30 * q1) + q2)   # score2 for males

b. Using the index approach

load(file = "mydata.Rdata")

create names in data frame

mydata <- data.frame(mydata, score1 = NA, score2 = NA)

attach(mydata)

\# find which are males and females

gals <- which(gender == "f")

guys <- which(gender =="m")

mydata[gals, "socre1"] <- 2*q1[gals] +q2[gals]

mydata[gals, "score2"] <- 3 *q1[gals] +q2[gals]

mydata[guys, "score1"] <- 20* q1[guys] + q2 *[guys]

mydata[guys, "score2"] <- 30 * q1[guys] +q2[guys]

\# clean up

rm(guys, gals)

PYTHON:
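A minimal pandas sketch of the index approach, assuming columns gender, q1, and q2 as in the R code (the data are hypothetical):

    import numpy as np
    import pandas as pd

    # Hypothetical data
    mydata = pd.DataFrame({"gender": ["f", "m", "f"],
                           "q1": [3, 4, 5], "q2": [4, 5, 3]})
    mydata["score1"] = np.nan   # create the new columns first, as in R
    mydata["score2"] = np.nan

    gals = mydata["gender"] == "f"
    guys = mydata["gender"] == "m"

    mydata.loc[gals, "score1"] = 2 * mydata["q1"] + mydata["q2"]
    mydata.loc[gals, "score2"] = 3 * mydata["q1"] + mydata["q2"]
    mydata.loc[guys, "score1"] = 20 * mydata["q1"] + mydata["q2"]
    mydata.loc[guys, "score2"] = 30 * mydata["q1"] + mydata["q2"]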

SAS:

    data mylib.mydata;
      set mylib.mydata;
      if gender = "f" then do;
        score1 = (2*q1) + q2;
        score2 = (3*q1) + q2;
      end;
      else if gender = "m" then do;
        score1 = (20*q1) + q2;
        score2 = (30*q1) + q2;
      end;
    run;

9. Missing values

When importing numeric data, R reads blanks as missing (except when blanks are delimiters). R reads the string NA as missing for both numeric and character variables. When importing a text file, both SAS and SPSS would recognize a period as a missing value for numeric variables; R will instead read the whole variable as a character vector!

SAS:

    data mylib.mydata;
      set mylib.mydata;
      if q1 = 9 then q1 = .;
      if q2 = 9 then q2 = .;
      if q3 = 99 then q3 = .;
      if q4 = 99 then q4 = .;

      * Same thing, but quicker for lots of variables;
      array q9 q1-q2;
      do over q9;
        if q9 = 9 then q9 = .;
      end;
      array q99 q3-q4;
      do over q99;
        if q99 = 99 then q99 = .;
      end;
    run;

PYTHON:

pandas primarily uses the value np.nan to represent missing data. It is by default not included in computations.

#1. To make detecting missing values easier (and across different array dtypes), pandas provides the isnull() and notnull() functions, which are also methods on Series and DataFrame objects:

    pd.isnull(df2['one'])
    df2['four'].notnull()
    df.isnull()

#2. Datetimes

For datetime64 types, NaT represents missing values

#3. Inserting missing data

You can insert missing values by simply assigning to containers. The actual missing value used will be chosen based on the dtype. For example, numeric containers will always use NaN regardless of the missing value type chosen:

    import numpy as np
    import pandas as pd

    s = pd.Series([1, 2, 3])
    s.loc[0] = None        # becomes NaN in a numeric Series

    s = pd.Series(['a', 'b', 'c'])
    s.loc[1] = np.nan

#4. Calculation with missing data

Missing values propagate naturally through arithmetic operations between pandas objects

The descriptive statistics and computational methods are all written to account for missing data. For example:

When summing data, NA (missing) values are treated as zero.

If the data are all NA, the result will be NA.

Methods like cumsum and cumprod ignore NA values, but preserve them in the resulting arrays.

    df['one'].sum()
    df.mean()

#5. Cleaning/filling missing data

pandas objects are equipped with various data manipulation methods for dealing with missing data.

Filling missing values: fillna

The fillna function can "fill in" NA values with non-null data in a couple of ways, which we illustrate:

Replace NA with a scalar value:

    df2.fillna(0)
    df2['four'].fillna('missing')

Fill gaps forward or backward:

    df.fillna(method='pad')

To remind you, these are the available filling methods:

pad or ffill: fill values forward
bfill or backfill: fill values backward

With time series data, using pad/ffill is extremely common so that the "last known value" is available at every time point.

The ffill() function is equivalent to fillna(method='ffill') and bfill() is equivalent to fillna(method='bfill').

Filling with a pandas object

You can also fillna using a dict or Series that is alignable. The labels of the dict or index of the Series must match the columns of the frame you wish to fill. A use case for this is to fill a DataFrame with the mean of each column:

    dff.fillna(dff.mean())
    dff.fillna(dff.mean()['B':'C'])
    dff.where(pd.notnull(dff), dff.mean(), axis='columns')

#6. Dropping axis labels with missing data: dropna

You may wish to simply exclude labels from a data set which refer to missing data. To do this, use the dropna method:

    df.dropna(axis=0)
    df.dropna(axis=1)
    df['one'].dropna()

Series.dropna is a simpler method, as it only has one axis to consider; DataFrame.dropna has considerably more options than Series.dropna.

#7. String/Regular expression replacement

Replace the '.' with NaN:

    d = {'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}
    df = pd.DataFrame(d)
    df.replace('.', np.nan)

    # Regex: replace '.' and remove surrounding whitespace
    df.replace(r'\s*\.\s*', np.nan, regex=True)

    # Replace a few different values
    df.replace(['a', '.'], ['b', np.nan])

    # Regex replacement applied to a single column
    df.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, regex=True)

#Numeric replacement

Similar to DataFrame.fillna:

    df = pd.DataFrame(np.random.randn(10, 2))
    df[np.random.rand(df.shape[0]) > 0.5] = 1.5
    df.replace(1.5, np.nan)

Replacing more than one value via lists works as well:

    df00 = df.values[0, 0]
    df.replace([1.5, df00], [np.nan, 'a'])

Missing data casting rules and indexing

While pandas supports storing arrays of integer and boolean type, these types are not capable of storing missing data. Until we can switch to using a native NA type in NumPy, we have established some "casting rules" for when reindexing will cause missing data to be introduced into, say, a Series or DataFrame.

R:

mydataNA <- read.table("mydataNA.txt")

\#read it so that ".", 9, 99 are missing.

mydataNA <- read.table("mydtaNA.txt", na.strings = c(".", "9", "99"))

\# convert 9 and 99 manually

mydataNA <- read.table("mydataNA.txt", na.string = ".")

mydataNA [mydataNA ==9 | mydataNA ==99] <-NA

\# substitute the mean for missing values

mydataNA$q1 [is.na(mydataNA$q1)] <- mean(mydataNA$q1, na.rm = TRUE)

\#eliminate observations with any NAs

myNoMissing <- na.omit(mydataNA)

\# finding complete observations

complete.cases(mydataNA)

\# use that result to select complete cases

myNoMissing <- mydataN[complete.cases(mydataNA), ]

\# use that result to select incomplete cases

myincomplete <- mydataNA [!complete.cases(mydataNA), ]

\# when "99" has meaning

mydataNA <- read.table("mydataNA.txt", na.strings = ".")

\# assign missing values for q variables

mydataNA$q1 [q1 == 9] <- NA

mydataNA$q2[q2 ==9] <- NA

mydataNA$q3 [q3 ==99] <- NA

mydataNA$q4 [q4 == 99] <- NA

10. Use our function

R:

       my9isNA <- function(x) {x[x == 9] <- NA; x}
       my99isNA <- function(x) {x[x == 99] <- NA; x}

       mydataNA[3:4] <- lapply(mydataNA[3:4], my9isNA)
       mydataNA[5:6] <- lapply(mydataNA[5:6], my99isNA)

PYTHON

pandas uses the floating-point value NaN to represent missing data in both floating-point and non-floating-point arrays.

    string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
    string_data.isnull()
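A minimal pandas equivalent of the R functions above, assuming the codes 9 and 99 mark missing values in hypothetical columns q1-q4:

    import numpy as np
    import pandas as pd

    # Hypothetical data where 9 and 99 are missing-value codes
    mydataNA = pd.DataFrame({"q1": [1, 9, 3], "q2": [9, 2, 4],
                             "q3": [99, 3, 5], "q4": [4, 99, 5]})

    mydataNA[["q1", "q2"]] = mydataNA[["q1", "q2"]].replace(9, np.nan)
    mydataNA[["q3", "q4"]] = mydataNA[["q3", "q4"]].replace(99, np.nan)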

11. Renaming variables

R:

a. Using the data editor

    fix(mydata)

    # Restore original names for the next example
    names(mydata) <- c("workshop", "gender", "q1", "q2", "q3", "q4")

b. Using the reshape2 package

    library("reshape2")
    myChanges <- c(q1 = "x1", q2 = "x2", q3 = "x3", q4 = "x4")
    mydata <- rename(mydata, myChanges)

c. The standard R approach

    names(mydata) <- c("workshop", "gender", "x1", "x2", "x3", "x4")

d. Using the edit function

    names(mydata) <- edit(names(mydata))

PYTHON: page 1422 and page 1265

The rename() method allows you to relabel an axis based on some mapping (a dict or Series) or an arbitrary function.

    s.rename(str.upper)

For a dict or Series:

    df.rename(columns={'one': 'foo', 'two': 'baz'},
              index={'a': 'apple', 'b': 'banana', 'd': 'durian'})

1) pandas.Series.rename

    Series.rename(index=None)

    s = pd.Series([1, 2, 3])
    s.rename("my_name")        # scalar, changes Series.name
    s.rename(lambda x: x**2)   # function, changes labels
    s.rename({1: 3, 2: 5})     # mapping, changes labels

2) pandas.Series.rename_axis

    Series.rename_axis(mapper, axis=0, copy=True, inplace=False)

    df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    df.rename_axis("foo")                                      # scalar, alters df.index.name
    df.rename_axis(lambda x: 2 * x)                            # function, alters labels
    df.rename_axis({"A": "ehh", "C": "see"}, axis="columns")   # mapping

pandas.DataFrame.rename works the same way as pandas.Series.rename.

SAS:

12. Renaming by index

R:

   mynames <- names(mydata)
   data.frame(mynames)
   mynames[3] <- "x1"
   mynames[4] <- "x2"
   mynames[5] <- "x3"
   mynames[6] <- "x4"
   names(mydata) <- mynames

a. Renaming by column name

    mynames <- names(mydata)
    mynames[mynames == "q1"] <- "x1"
    mynames[mynames == "q2"] <- "x2"
    mynames[mynames == "q3"] <- "x3"
    mynames[mynames == "q4"] <- "x4"
    names(mydata) <- mynames

b. Renaming many sequentially numbered variables

    names(mydata)
    myXs <- paste("x", 1:4, sep = "")
    myA <- which(names(mydata) == "q1")
    myZ <- which(names(mydata) == "q4")
    names(mydata)[myA:myZ] <- myXs
    names(mydata)

PYTHON:
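A minimal pandas sketch, assuming a DataFrame `mydata` with the same column layout: columns can be renamed by position by rewriting the columns list.

    import pandas as pd

    # Hypothetical DataFrame with the same column layout as mydata
    mydata = pd.DataFrame(columns=["workshop", "gender", "q1", "q2", "q3", "q4"])

    # Rename columns 3 through 6 by position (zero-based slice 2:6)
    newnames = list(mydata.columns)
    newnames[2:6] = ["x1", "x2", "x3", "x4"]
    mydata.columns = newnames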

SAS:

     data mylib.mydata;
       set mylib.mydata;
       rename q1-q4 = x1-x4;
     run;

13. Recoding variables

a. Recoding many variables

R:

library("car")

mydata$qr1 <- recode(q1, "1=2; 5= 4")

mydata$qr2 <- recode(q2, "1=2; 5= 4"

mydata$qr3 <- recode(q3, "1=2; 5=4")

mydata$qr4 <- recode(q4, 1=2; 5=4")

PYTHON:
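A minimal pandas counterpart, assuming a hypothetical q1 column coded 1-5: Series.replace recodes individual values, much like car::recode's "1=2; 5=4".

    import pandas as pd

    # Hypothetical survey item coded 1-5
    mydata = pd.DataFrame({"q1": [1, 2, 3, 4, 5]})

    mydata["qr1"] = mydata["q1"].replace({1: 2, 5: 4})  # 1 becomes 2, 5 becomes 4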

SAS:

      LIBNAME mylib 'C:\myRfolder';
      data mylib.mydata;
        infile 'C:\myRfolder\mydata.csv' delimiter = ','
          MISSOVER DSD LRECL = 32767 firstobs = 2;
        INPUT id workshop gender $ q1 q2 q3 q4;
      proc print; run;

      proc format;
        value agreement 1 = "Disagree" 2 = "Disagree"
          3 = "Neutral" 4 = "Agree" 5 = "Agree";
      run;

      data mylib.mydata;
        set mylib.mydata;
        array q q1-q4;
        array qr qr1-qr4;
        do i = 1 to 4;
          qr{i} = q{i};
          if q{i} = 1 then qr{i} = 2;
          else if q{i} = 5 then qr{i} = 4;
        end;
        format q1-q4 agreement.;
      run;

      * This will use the recoded formats automatically;
      proc freq;
        tables q1-q4;
      run;

      * This will ignore the formats;
      proc univariate;
        var q1-q4;
      run;
      proc univariate;
        var qr1-qr4;
      run;

14. Indicator or Dummy variables

R:

load("mydata100.RData") attach(mydata100) r <- as.numeric(workshop == "R") sas <- as.numeric(workshop == "SAS") spss <- as.numeric(workshop == "spss") stata <- as.numeric(workshop == "Stata") head(data.frame(workshop, r, sas, spss, stata)) lm(posttest ~ pretest +sas+spss+stata) lm(posttest ~ prestest +workshop) workshop <- relevel (workshop, "SAS") coef(lm(posttest ~ pretest +workshop)) library("nmet") head(class.ind(workshop))

PYTHON:
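A minimal pandas sketch, assuming a hypothetical workshop factor: pd.get_dummies creates one indicator column per level, similar to class.ind in R.

    import pandas as pd

    # Hypothetical workshop factor
    workshop = pd.Series(["R", "SAS", "SPSS", "Stata", "R"])

    pd.get_dummies(workshop)                    # one 0/1 column per level
    pd.get_dummies(workshop, drop_first=True)   # drop one level for regression use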

SAS:

 data temp;
  set mylib.mydata100;
    r = workshop = 1;
    sas = workshop = 2;
    spss = workshop = 3;
    stata = workshop = 4;
  run;

  proc reg;
    model posttest = pretest sas spss stata;
  run;

15. Keeping and Dropping variables

R:

a. Using variable selection

    myleft <- mydata[, 1:4]

b. Using NULL

    myleft <- mydata
    myleft$q3 <- myleft$q4 <- NULL

PYTHON:

a. Reindexing

    In: obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])

Calling reindex on this Series rearranges the data according to the new index, introducing missing values if any index values were not already present:

    In: obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

    Out:
    a   -5.3
    b    7.2
    c    3.6
    d    4.5
    e    NaN

or

    obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

or

    In: obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
        obj3.reindex(range(6), method='ffill')

reindex method (interpolation) options

ffill or pad: fill (or carry) values forward
bfill or backfill: fill (or carry) values backward

With DataFrame, reindex can alter either the (row) index, the columns, or both. When passed just a sequence, the rows are reindexed in the result.

b. Dropping entries from an axis

Dropping one or more entries from an axis is easy if you have an index array or list without those entries. As that can require a bit of munging and set logic, the drop method will return a new object with the indicated value or values deleted from an axis:

    In: obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
        new_obj = obj.drop('c')

With DataFrame, index values can be deleted from either axis:

    In: data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                            index=['Ohio', 'Colorado', 'Utah', 'New York'],
                            columns=['one', 'two', 'three', 'four'])

        data.drop(['Colorado', 'Ohio'])
        data.drop('two', axis=1)
        data.drop(['two', 'four'], axis=1)

SAS:

    data myleft;
      set mydata;
      keep id workshop gender q1 q2;
    run;

    data myleft;
      set mydata;
      drop q3 q4;
    run;
