Reading:
1. Comma delimited files
SAS:
proc import out = mylib.mydata
datafile = "c"\myRfolder\mydataID.csv"
DBMS = CSV REPLACE;
GETNAMES = YES;
DATAROW = 2;
RUN;
PYTHON:
read_csv: load delimited data from a file, URL, or file-like object; uses comma as the default delimiter.
read_table: load delimited data from a file, URL, or file-like object; uses tab ('\t') as the default delimiter.
read_fwf: read data in fixed-width column format (that is, no delimiters).
read_clipboard: version of read_table that reads data from the clipboard; useful for converting tables from web pages.
or
df = pd.read_table('ch06/ex1.csv', sep = ",")
df = pd.read_csv('ch06/ex1.csv', index_col = 0) # pass the column number or column name you wish to use as the index
pd.read_csv ('foo.csv', index_col = 'date')
pd.read_csv('foo.csv', index_col = [0,'A'])# hierarchical index
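A minimal self-contained sketch of the same pattern (the inline data here is made up for illustration; the ch06/* paths above refer to the book's example files):
from io import StringIO
import pandas as pd
data = 'date,A,B\n2000-01-03,0.98,1.21\n2000-01-04,-0.12,0.55'  # illustrative inline data
df = pd.read_csv(StringIO(data), index_col = 'date')  # the 'date' column becomes the index
print(df)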
- The dialect keyword gives greater flexibility in specifying the file format; read_csv uses the excel dialect by default
data = 'a,b,c~1, 2,3~4, 5,6'
pd.read_csv(StringIO(data), lineterminator = '~')
Another common dialect option is skipinitialspace, which skips whitespace after the delimiter:
data = 'a,b,c\n1,2,3\n4,5,6'
pd.read_csv(StringIO(data), skipinitialspace = True)
- specifying column data types: you can indicate the data type for the whole dataframe or individual columns
data = 'a, b, c\n1, 2,3\n4, 5,6\n7, 8,9'
df = pd.read_csv(StringIO(data), dtype = object)
or
df = pd.read_csv(StringIO(data), dtype = {'b': object, 'c': np.float64})
- specifying categorical dtype
data = 'col1, col2, col3\na, b, 1\na, b, 2\nc, d, 3'
pd.read_csv(StringIO(data), dtype= 'category').dtypes
or for an individual column:
pd.read_csv(StringIO(data), dtype = {'col1': 'category'}).dtypes
if the categories are numeric they can be converted using the to_numeric() function, or another appropriate converter such as to_datetime():
df = pd.read_csv(StringIO(data), dtype = 'category')
df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
- Naming and Using columns: a file may or may not have a header row; by default pandas assumes the first row is the header
data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9'
pd.read_csv(StringIO(data), names = ['foo', 'bar', 'baz'], header = 0) # throw away the header
pd.read_csv(StringIO(data), names = ['foo', 'bar', 'baz'], header = None) # header = None keeps the original header row as data
pd.read_csv(StringIO(data), header = 1) # if the header is in a row other than the first, pass the row number to header
- Duplicate names parsing: if the file or header contains duplicate names, pandas by default will de-duplicate these names to prevent data from being overwritten:
data = 'a,b,a\n0,1,2\n3,4,5'
pd.read_csv(StringIO(data), mangle_dupe_cols = False) # disabling mangling would let duplicate columns overwrite each other, so a ValueError is raised
- Filtering columns: the usecols argument allows you to select any subset of the columns in a file, either using the column names or position numbers:
data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz'
pd.read_csv(StringIO(data), usecols = ['b', 'd'])
pd.read_csv(StringIO(data), usecols = [0,2,3])
- Comments and Empty lines:
Ignoring line comments and empty lines
data = '\na, b,c\n, \n# commented line\n1,2,3\n\n4,5,6'
pd.read_csv(StringIO(data), comment= '#')
data = 'a,b,c\n\n1,2,3\n\n\n4,5,6'
pd.read_csv(StringIO(data), skip_blank_lines = False)
data = '#comment\na,b,c\nA,B,C\n1,2,3'
pd.read_csv(StringIO(data), comment = '#', skiprows = 2)
Comments: if comments or metadata are included in a file, they can be stripped by passing the comment character:
df = pd.read_csv('tmp.csv', comment = '#')
#8. dealing with unicode data
df = pd.read_csv(BytesIO(data), encoding = 'latin-1')
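To make the line above runnable, data must be a byte string in the stated encoding; a small sketch with illustrative latin-1 bytes:
from io import BytesIO
import pandas as pd
data = b'word,length\nTr\xe4umen,7\nGr\xfc\xdfe,5'  # latin-1 encoded bytes (illustrative)
df = pd.read_csv(BytesIO(data), encoding = 'latin-1')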
#9 Index columns and trailing delimiters
if a file has one more column of data than the number of column names, the first column will be used as the dataframe's row names:
data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'
pd.read_csv(StringIO(data))
# if each data line instead ends with a trailing delimiter, pass index_col = False so the first column is not treated as the index:
data = 'a,b,c\n4,apple,bat,\n8,orange,cow,'
pd.read_csv(StringIO(data), index_col = False)
#10. Date Handling
specifying data columns
To better facilitate working with datetime data, read_csv() and read_table() use the keyword arguments parse_dates and date_parser to allow users to specify a variety of columns and date/time formats to turn the input text data into datetime objects
df = pd.read_csv('foo.csv', index_col = 0, parse_dates = True)
df = pd.read_csv('tmp.csv', header = None, parse_dates = [[1,2], [1,3]]) # for concatenation of the component column names
df = pd.read_csv('tmp.csv', header = None, parse_dates = [[1,2], [1,3]], keep_date_col = True) # keep the raw date columns as well
date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
df = pd.read_csv('tmp.csv', header = None, parse_dates = date_spec) # specify new column names
df = pd.read_csv('tmp.csv', header = None, parse_dates = date_spec, date_parser = conv.parse_date_time) # here conv is pandas.io.date_converters; note that read_csv has a fast path for parsing datetime strings in ISO 8601 format, so if the data stores datetimes in that format, load times will be significantly faster
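A self-contained sketch of combining two columns into one datetime (the inline rows are illustrative, in the same shape as the tmp.csv rows assumed above):
from io import StringIO
import pandas as pd
data = 'KORD,19990127,19:00:00,0.81\nKORD,19990127,20:00:00,0.01'  # illustrative rows: station, date, time, reading
df = pd.read_csv(StringIO(data), header = None, parse_dates = [[1, 2]])
print(df.dtypes)  # the combined column '1_2' is parsed as datetime64[ns]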
Inferring datetime format
df = pd.read_csv('foo.csv', index_col = 0, parse_dates = True, infer_datetime_format = True)
International date formats
While US date formats tend to be MM/DD/YYYY, many international formats use DD/MM/YYYY instead; the dayfirst keyword is provided for this case:
pd.read_csv('tmp.csv', dayfirst = True, parse_dates = [0])
#11. Specifying method for floating-point conversion
pd.read_csv(StringIO(data), engine = 'c', float_precision = None) # float_precision can be None, 'high', or 'round_trip'
#12. Thousand separators
df = pd.read_csv('tmp.csv', sep = '|', thousands = ',')
#13. NA values
To control which values are parsed as missing values, specify a string or list of strings in na_values.
read_csv(path, keep_default_na = False, na_values = [""]) # only an empty field will be NaN
read_csv(path, keep_default_na = False, na_values = ["NA", "0"]) # only the strings NA and 0 are NaN
read_csv(path, na_values = ["Nope"]) # the default values, in addition to the string "Nope", are recognized as NaN
#14. Returning series: using the squeeze keyword, the parser will return output with a single column as a series
pd.read_csv('tmp.csv', squeeze = True)
#15. Boolean values
pd.read_csv(StringIO(data), true_values = ['Yes'], false_values = ['No'])
#16. Handling "bad" lines
Some files may have malformed lines with too few fields or too many. Lines with too few fields will have NA values filled in the trailing fields, while lines with too many will cause an error by default:
data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10'
pd.read_csv(StringIO(data), error_bad_lines = False) # skip bad lines instead of raising
#17. Quoting and Escape characters
data = 'a,b\n"hello, \"Bob\", nice to see you", 5'
pd.read_csv(StringIO(data), escapechar = '\')
#18. Index
reading an index with a multiindex
print(open('data/mindex_ex.csv').read())
the index_col argument to read_csv and read_table can take a list of column numbers to turn multiple columns into a multiindex for the index of the returned object:
df = pd.read_csv("data/mindexex.csv", index_col = [0,1])
#19. Automatically "sniffing" the delimiter
read_csv is capable of inferring the delimiter of delimited (not necessarily comma-separated) files, as pandas uses the csv.Sniffer class of the csv module. For this, you need to specify sep = None:
pd.read_csv('tmp2.csv', sep = None, engine = 'python')
#20. Iterating through files chunk by chunk
Suppose you wish to iterate through a (potentially very large) file lazily rather than reading the entire file into memory.
reader = pd.read_table('tmp.csv', sep = '|' , chunksize = 4)
for chunk in reader:
print(chunk)
or specifying iterator = True will also return a TextFileReader object:
reader = pd.read_table('tmp.csv', sep = '|', iterator = True)
reader.get_chunk(5)
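A sketch of doing something with each chunk rather than just printing it, assuming (as in the book's ex6.csv example) that the file has a 'key' column:
chunker = pd.read_csv('ch06/ex6.csv', chunksize = 1000)
tot = pd.Series([], dtype = float)
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value = 0)
tot = tot.sort_values(ascending = False)  # aggregated value counts across all chunks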
a. Assign column names
pd.read_csv('ch06/ex2.csv', names = ['a','b', 'c', 'message'])
b. Assign the index
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ch06/ex2.csv', names = names, index_col = 'message') # or pass a list of columns to index_col for a hierarchical index
c. Skip specified rows
pd.read_csv('ch06/ex4.csv', skiprows = [0, 2,3])
d. Handling missing values
result = pd.read_csv('ch06/ex5.csv', na_values = ['NULL'])
Different NA sentinels can be specified for each column in a dict:
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('ch06/ex5.csv', na_values = sentinels)
e. Only read out a small number of rows
pd.read_csv('ch06/ex6.csv', nrows = 5)
R:
a. With id variable not named
mydata <- read.csv("mydata.csv")
b. With id named in the header
mydata <- read.csv("mydataID.csv", row.names = "id")
2. Tab delimited files:
SAS:
PROC IMPORT OUT = mylib.mydata
DATAFILE = "C:\myRworkshop\mydataID.tab"
DBMS = TAB REPLACE;
GETNAMES = YES;
DATAROW =2;
RUN;
PYTHON:
df = pd.table('ch06/ex1.csv', sep ="\n")
R:
a. Read a tab delimited file with named ID columns
mydata <- read.delim("mydata.tab")
count.fields("mydata.tab", sep = "\t")
b. With ID named in the header
mydata <- read.delim("mydataID.tab",row.names = "id")
3. Reading from a web site:
SAS:
FILENAME myURL URL
"http://sites.google.com/site/r4statistics/mydataID.csv";
PROC IMPORT DATAFILE = myURL
DBMS = CSV REPLACE
OUT = mylib.mydata;
GETNAMES = YES;
DATAROW =2;
RUN;
R:
myURL <- "http:/sites.google.com/site/r4statistics/mydata.csv"
mydata<- read.csv(myURL)
4. Reading text from the clipboard
SAS: (NEED TO CONFIRM)
PYTHON:
read_clipboard: version of read_table that reads data from the clipboard. useful for converting tables from web pages
A handy way to grab data is to use the read_clipboard method, which takes the contents of the clipboard buffer and passes them to the read_table method. For instance, you can copy a table to the clipboard (Ctrl-C on many operating systems) and then:
clipdf = pd.read_clipboard()
R:
a. Copy a column of numbers or words, then :
myvector <- readClipboard()
b. Open mydata.csv, select & copy contents, then :
mydata <- read.delim ("clipboard", header = TURE)
c. Missing values for character variables
mydata <- read.csv("mydataID.csv", row.names = "id", strip.white = TRUE, na.strings = "")
d. Skipping variables in delimited text files
myCols <- read.delim("mydata.tab", strip.white = TRUE, na.strings = "", colClasses = c("integer", "integer", "character", "NULL", "NULL", "integer", "integer"))
5. Reading text data within a program
R:
a. The stdin approach
mydata <- read.csv(stdin())
id, workshop, gender, q1, q2, q3, q4
1,1,f,1,1,5,1
b. Blank line above ends input
c. The textConnection approach
mystring <- "id, workshop, gender, q1, q2, q3, q4
1, 1, f, 1, 1, 5, 1"
mydata <- read.csv(textConnection(mystring))
SAS:
LIBNAME myLib 'C:\myRfolder';
DATA mylib.mydata;
INFILE DATALINES DELIMITER = ','
MISSOVER DSD firstobs = 2;
INPUT id workshop gender $ q1 q2 q3 q4;
DATALINES;
id,workshop,gender,q1,q2,q3,q4
1,1,f,1,1,5,1
;
PROC PRINT;
RUN;
6. Reading multiple observations per line
R:
a. The stdin approach
mylist <- scan(stdin(),
what = list(id = 0, workshop = 0, gender = " ",
q1 = 0, q2 = 0, q3 = 0, q4 = 0))
1 1 f 1 1 5 1
b. Blank line above ends input
mydata <- data.frame(mylist)
c. The textConnection approach
mystring <- "1 1 f 1 1 5 1";
mylist <- scan(textConnection(mystring),
what = list(id =0, workshop = 0, gender = " ",
q1 =0, q2 = 0, q3 = 0, q4 =0 ))
mydata <- data.frame(mylist)
SAS:
DATA mydata;
INPUT id workshop gender $ q1-q4 @@;
DATALINES;
1 1 f 1 1 5 1
;
PROC PRINT;
RUN;
7. Reading fixed-width text files : one record per case
R:
a.
mydata <- read.fwf(
file = "mydataFWF.txt",
width = c(2, -1, 1, 1, 1,1,1 ),
col.names = c("id", "gender", "q1", "q2", "q3", "q4"), row.names = "id",
na.strings = "",
fill = TRUE,
strip.white = TRUE)
b. Using "macro substitution".
myfile <- "mydataFWF.txt"
myvariablenames <- c("id", "gender", "q1", "q2", "q3", "q4")
myvariablewidths <- c(2, -1, 1,1,1,1,1)
mydata <- read.fwf( file = myfile, width = myVariableWidths, col.names = myVariableNames,
row.names = "id", na.strings = " ", fill = TRUE, strip.white = TRUE)
PYTHON:
# column specifications are a list of half-open intervals
colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)]
df = pd.read_fwf('bar.csv', colspecs = colspecs, header = None, index_col = 0)
# can also supply just the column widths for contiguous columns
widths = [6, 14, 13, 10]
df = pd.read_fwf('bar.csv', widths = widths, header = None)
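A self-contained sketch with made-up fixed-width text, where the widths are chosen to match the layout of the inline data:
from io import StringIO
import pandas as pd
data = 'id8141 360.24 149.91\nid1594 444.95 166.99'  # illustrative fixed-width rows
df = pd.read_fwf(StringIO(data), widths = [7, 7, 7], header = None, index_col = 0)
print(df)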
SAS:
LIBNAME mylib 'C:\myRfolder';
DATA myLib.mydata;
INFILE 'C:\myRfolder\mydataFWF.txt'
MISSOVER;
INPUT ID 1-2 WORKSHOP 3 GENDER $ 4 q1 5 q2 6 q3 7 q4 8;
RUN;
8. Reading fixed - width text files, two or more records per case
R:
myfile <- "mydataFWF.txt"
myVariableNames <- c ("id", "workshop", "gender", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8")
myRecord1Widths <- c(2,1,1,1,1,1,1)
myRecord2Widths <- c(-2, -1, -1, 1,1,1,1)
myVariableWidths <- list(myRecord1Widths, myRecord2Widths)
mydata <- read.fwf( file = myfile, width = myVariableWidths, col.names = myVariableNames, row.names = "id", na.strings = " ", fill = TRUE, strip.white = TRUE)
PYTHON:
Same as above (see the read_fwf examples in section 7)
SAS:
DATA temp;
INFILE 'C:\myRfolder\mydataFWF.txt' MISSOVER;
INPUT #1 id 1-2 workshop 3 gender $ 4 q1 5 q2 6 q3 7 q4 8 #2 q5 5 q6 6 q7 7 q8 8;
PROC PRINT;
RUN;
9. Reading excel files
R:
library("xlsReadWrite")
xls.getshlib()
downloads the shared library binary, which is not distributed through CRAN
mydata <- read.xls("mydata.xls")
PYTHON:
The read_excel method can read Excel 2003 (.xls) and Excel 2007 (.xlsx) files using the xlrd Python module; the to_excel instance method is used for saving a DataFrame to Excel. Generally the semantics are similar to working with CSV data.
pd.read_excel('path_to_file.xls', sheetname = 'Sheet1')
or
xlsx = pd.ExcelFile('path_to_file.xls')
df = pd.read_excel(xlsx, 'Sheet1')
or
with pd.ExcelFile('path_to_file.xls') as xls:
    df1 = pd.read_excel(xls, 'Sheet1')
    df2 = pd.read_excel(xls, 'Sheet2')
if an ExcelFile is used to parse multiple sheets with different parameters:
data = {}
with pd.ExcelFile('path_to_file.xls') as xls:
    data['Sheet1'] = pd.read_excel(xls, 'Sheet1', index_col = None, na_values = ['NA'])
    data['Sheet2'] = pd.read_excel(xls, 'Sheet2', index_col = 1)
if the same parsing parameters are used for all sheets, a list of sheet names can simply be passed to read_excel with no loss in performance.
data = pd.read_excel('path_to_file.xls', ['Sheet1', 'Sheet2'], index_col = None, na_values = ['NA'])
Parsing specific columns:
pd.read_excel('path_to_file.xls', 'Sheet1', parse_cols = [0, 2, 3])
SAS:
LIBNAME mylib "c:\myRfolder";
PROC IMPORT OUT = mylib.mydata
DATAFILE = "C:\myRfolder\mydata.xls"
DBMS = EXCELCS REPLACE;
RANGE = "Sheet1$";
SCANTEXT = YES;
USEDATE = YES;
SCANTIME = YES;
run;
10. Reading from relational databases
R:
library("RODBC")
myConnection <- odbcConnectExcel("mydata.xls")
mydata <- sqlFetch(myConnection, "Sheet1")
close(myConnection)
SAS:
11. Reading HDF5(only for PYTHON)
12. Reading data from SAS
R:
library("foreign")
mydata <- read.ssd("c:/myRfolder", "mydata",
sascmd - "C:/Program files/SAS/SASFoundation/9.2/sas.exe")
library("Hmisc")
mydata <- sasxport.get("mydata.xpt")
PYTHON:
The top-level function read_sas() currently can read (but not write) SAS XPORT (.xpt) format files; pandas cannot currently handle SAS7BDAT files.
XPORT files only contain two value types: ASCII text and double-precision numeric values. There is no automatic type conversion to integers, dates, or categoricals. By default the whole file is read and returned as a DataFrame.
df = pd.read_sas('sas_xport.xpt')
or
rdr = pd.read_sas('sas_xport.xpt', chunksize = 100000)
for chunk in rdr:
    do_something(chunk)
13. Reading from sql
The pandas.io.sql module provides a collection of query wrappers to both facilitate data retrieval and reduce dependency on DB-specific APIs. Database abstraction is provided by SQLAlchemy if it is installed; in addition you will need a driver library for your database.
If SQLAlchemy is not installed, a fallback is only provided for sqlite (and for mysql for backwards compatibility, but this is deprecated and will be removed in a future version). This mode requires a Python database adapter which respects the Python DB-API.
The key functions are :
Code | Function |
---|---|
read_sql_table(table_name, con[, schema, ...]) | Read SQL database table into a DataFrame |
read_sql_query(sql, con[, index_col, ...]) | Read SQL query into a DataFrame |
read_sql(sql, con[, index_col, ...]) | Read SQL query or database table into a DataFrame |
DataFrame.to_sql(name, con[, flavor, ...]) | Write records stored in a DataFrame to a SQL database |
pandas.read_sql_table: given a table name and an SQLAlchemy connectable, returns a DataFrame; this function does not support DBAPI connections.
pandas.read_sql_query: returns a dataframe corresponding to the result set of the query string.
con: SQLAlchemy connectable (engine/connection), database string URI, or sqlite3 DBAPI2 connection. Using SQLAlchemy makes it possible to use any DB supported by that library; if a DBAPI2 object is passed, only sqlite3 is supported.
Any datetime values with time zone information parsed via the parse_dates parameter will be converted to UTC
pandas.read_sql: read SQL query or database table into a Dataframe
In the following example, we use the SQLite database engine; you can use a temporary SQLite database where data are stored in "memory".
To connect with SQLAlchemy you use the create_engine() function to create an engine object from a database URI. You only need to create the engine once per database you are connecting to.
from sqlalchemy import create_engine
#create your engine:
engine = create_engine('sqlite:///:memory:')
if you want to manage your own connections you can pass one of those instead:
with engine.connect() as conn, conn.begin():
    data = pd.read_sql_table('data', conn)
#Writing DataFrames
data.to_sql('data', engine, chunksize = 1000)
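A minimal end-to-end sketch using the in-memory SQLite engine from above (the DataFrame contents are made up for illustration):
import pandas as pd
from sqlalchemy import create_engine
engine = create_engine('sqlite:///:memory:')
df = pd.DataFrame({'id': [1, 2], 'Col_1': ['a', 'b'], 'Col_2': [1.5, 2.5]})
df.to_sql('demo', engine, index = False)  # write the DataFrame to a new table
print(pd.read_sql_query('SELECT * FROM demo', engine))  # read it back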
#SQL data types
to_sql() will try to map your data to an appropriate SQL data type based on the dtype of the data. When you have columns of dtype object, pandas will try to infer the data type.
You can always override the default type by specifying the desired SQL type for any of the columns using the dtype argument. This argument needs a dictionary mapping column names to SQLAlchemy types (or strings for the sqlite3 fallback mode). For example, to use the SQLAlchemy String type instead of the default Text type for string columns:
from sqlalchemy.types import String
data.to_sql('data_dtype', engine, dtype = {'Col_1': String})
#1. Reading Tables
read_sql_table() will read a database table given the table name and optionally a subset of columns to read.
In order to use read_sql_table() , you must have the SQLAlchemy optional dependency installed.
pd.read_sql_table('data', engine, index_col = 'id') # specify the name of the column to use as the DataFrame index
pd.read_sql_table('data', engine, columns = ['col_1', 'col_2']) # specify a subset of columns to be read
pd.read_sql_table('data', engine, parse_dates = ['Date']) # explicitly force columns to be parsed as dates
pd.read_sql_table('data', engine, parse_dates = {'Date': '%Y-%m-%d'})
pd.read_sql_table('data', engine, parse_dates = {'Date': {'format': '%Y-%m-%d %H:%M:%S'}}) # specify a format string
#2. Querying
You can query using raw SQL in the read_sql_query() function. In this case you must use the SQL variant appropriate for your database.When using SQLAlchemy, you can also pass SQLAlchemy expression language constructs, which are database-agnostic.
pd.read_sql_query('SELECT * FROM data', engine)
or
pd.read_sql_query("SELECT id, Col_1, Col_2 FROM data WHERE id = 42;", engine)_
or
for chunk in pd.read_sql_query("SELECT * FROM data_chunks", engine, chunksize = 5):
    print(chunk) # the chunksize argument makes read_sql_query return an iterator of chunks
You can also run a plain query without creating a DataFrame with execute(). This is useful for queries that don't return values, such as INSERT, and is functionally equivalent to calling execute on the SQLAlchemy engine or db connection object.
from pandas.io import sql
sql.execute('SELECT * FROM table_name', engine)
sql.execute('INSERT INTO table_name VALUES(?, ?, ?)', engine, params = [('id', 1,12.2, True)])
Engine connection examples
To connect with SQLAlchemy you use the create_engine() function to create an engine object from database URI. you only need to create the engine once per database you are connecting to.
from sqlalchemy import create_engine
engine = create_engine('postgresql://scott:tiger@localhost:5432/mydatabase')
or
engine = create_engine('oracle://scott:[email protected]:1521/sidname')
Writing:
1. Write data from SAS and read it into R(only for R)
LIBNAME mylib 'C:\myRfolder';
LIBNAME To_R xport '\myRfolder\mydata.xpt';
DATA To_R.mydata;
set mylib.mydata;
RUN;
# read a SAS data set
# read ssd or sas7bdat if you have SAS installed
library("foreign")
mydata <- read.ssd("c:/myRfolder“, ”mydata", sascmd = "C:/program files /SAS/SASFoundation/9.2/sas.exe")
# reads SAS export format without installing SAS
library("foreign")
library("Hmisc")
mydata <- sasxport.get("mydata.xpt")
2. Writing delimited text files
SAS:
PROC PRINT DATA = mylib.mydata;
run;
PROC EXPORT DATA = mylib.mydata
outfile = "C:\myFolder\mydataFromSAS.csv"
DBMS = CSV REPLACE;
PUTNAMES = YES;
RUN;
PROC EXPORT DATA = mylib.mydata
outfile = "C"\myFolder\mydataFromSAS.txt"
DBMS = TAB REPLACE;
PUTNAMES = YES;
RUN;
PYTHON:
data.to_csv('ch06/out.csv')
or
data.to_csv(sys.stdout,sep = '|')
denote missing values with some other sentinel value:
data.to_csv(sys.stdout, na_rep = 'NULL')
or
data.to_csv(sys.stdout,index = False, header = False)
or
data.to_csv(sys.stdout, index = False, columns = ['a', 'b', 'c'])
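Series also have a to_csv method; a small sketch with made-up data (the path mirrors the book's folder layout):
import numpy as np
import pandas as pd
dates = pd.date_range('1/1/2000', periods = 7)
ts = pd.Series(np.arange(7), index = dates)
ts.to_csv('ch06/tseries.csv')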
R:
1. write.csv(mydata, "mydataFromR.csv")
2. write.table(mydata, "mydataFromR.txt")
3. write.table(mydata, file = "mydataFromR.txt",
quote = FALSE, sep = "\t", na = " ",
row.names = TRUE, col.names = TRUE)
3. Viewing a text file (only for R)
file.show("mydataFromR.csv")
4. Writing Excel files
R:
library("xlsReadWrite")
xls.getshlib()
load("mydata.RData")
write.xls (mydata, "mydataFromR.xls")
SAS:
LIBNAME mylib "c:\myFolder";
PROC EXPORT DATA = mylib.mydata
OUTFILE = "C:\myFolder\mydata.xls"
DBMS = EXCELCS LABEL REPLACE;
SHEET = "mydata";
RUN;
PYTHON:
df.to_excel ('foo.xlsx', sheet_name = 'Sheet1')
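To write several DataFrames to separate sheets of one workbook, pandas provides ExcelWriter; a sketch where df1 and df2 are assumed to be existing DataFrames and the file name is illustrative:
with pd.ExcelWriter('output.xlsx') as writer:
    df1.to_excel(writer, sheet_name = 'Sheet1')
    df2.to_excel(writer, sheet_name = 'Sheet2')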
5. Writing to relational databases
R:
library("RODBC")
myConnection <- odbcConnectExcel("mydataFromR.xls", readOnly = FALSE)
sqlSave(myConnection, mydata)
close(myConnection)
PYTHON:
SAS:
6. Writing data to SAS (only for R)
library("foreign")
write.foreign (mydata, datafile = "mydataFromR.csv", codefile = "mydata.sas", package = "SAS")