Python Other
PyPi - Python Package Index
What is PyPi?
pip
- pip
$ pip install package_name
Configure pip on Windows to avoid SSL issues
On the command line:
pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org PACKAGE_NAME
Run the following command to get the list of configuration files:
pip config -v list
You will see something like this: (your username instead of FooBar)
For variant 'global', will try loading 'C:\ProgramData\pip\pip.ini'
For variant 'user', will try loading 'C:\Users\FooBar\pip\pip.ini'
For variant 'user', will try loading 'D:\Data\Users\FooBar\AppData\Roaming\pip\pip.ini'
For variant 'site', will try loading 'C:\Users\FooBar\AppData\Local\Programs\Python\Python310\pip.ini'
Create the first pip.ini file with the following content:
[global]
trusted-host = pypi.org files.pythonhosted.org pypi.python.org
If you run the pip config -v list again, you'll see an additional line on the output:
global.trusted-host='pypi.org, files.pythonhosted.org ,pypi.python.org'
pip will now disregard the SSL issues.
Upgrade pip
pip install --upgrade pip
Will probably not work on Windows because the file is in use...
Upgrade PIP on Windows
py -m pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --upgrade pip
PYTHONPATH
export PYTHONPATH=~/python
Requirements
numpy
pandas
requests
flask>=1.00
pip install -r requirements.txt
Virtualenv
- virtualenv
On Linux/macOS:
$ cd project_dir
$ virtualenv -p python3 venv
$ source venv/bin/activate
$ ...
$ deactivate
On Windows:
venv\Scripts\activate.bat
...
deactivate
The virtualenv command will create a copy of python in the given directory inside the current directory.
In the above example it will create the copy in the 'venv' directory inside the 'project_dir'.
After source-ing the 'activate' file the PATH will include the local python with a local version of pip.
This requires bash or zsh.
See also the Python guide.
Python Web server
Hello world web
- WSGI
- CGI
from wsgiref.util import setup_testing_defaults
from wsgiref.simple_server import make_server
import time
def hello_world(environ, start_response):
    """Minimal WSGI application: answer every request with a plain-text greeting.

    :param environ: the WSGI environment dict; missing keys are filled in
        by ``setup_testing_defaults``.
    :param start_response: the WSGI callable that receives the status line
        and the list of response headers.
    :return: a one-element list holding the UTF-8 encoded body, which
        includes the current timestamp.
    """
    setup_testing_defaults(environ)

    status = '200 OK'
    headers = [('Content-type', 'text/plain; charset=utf-8')]
    start_response(status, headers)

    # WSGI bodies are bytes, hence the explicit encode.
    res = f"Hello World {time.time()}".encode('utf-8')
    return [res]
port = 8080
with make_server('0.0.0.0', port, hello_world) as httpd:
print("Serving on port {}...".format(port))
httpd.serve_forever()
Dump web environment info
from wsgiref.util import setup_testing_defaults
from wsgiref.simple_server import make_server
# A relatively simple WSGI application. It's going to print out the
# environment dictionary after being updated by setup_testing_defaults
def simple_app(environ, start_response):
    """WSGI application that dumps the request environment as plain text.

    :param environ: the WSGI environment dict; it is updated in place by
        ``setup_testing_defaults`` before being dumped.
    :param start_response: the WSGI callable that receives the status line
        and the list of response headers.
    :return: a list of UTF-8 encoded ``key: value`` lines, one per
        environment entry.
    """
    setup_testing_defaults(environ)

    status = '200 OK'
    headers = [('Content-type', 'text/plain; charset=utf-8')]
    start_response(status, headers)

    # dict.iteritems() exists only in Python 2, and on Python 3 a WSGI
    # application has to yield bytes, so every line is encoded.
    ret = ["{}: {}\n".format(key, value).encode('utf-8')
           for key, value in environ.items()]
    return ret
httpd = make_server('', 8000, simple_app)
print("Serving on port 8000...")
httpd.serve_forever()
# taken from the standard documentation of Python
Web echo
from wsgiref.util import setup_testing_defaults
from wsgiref.simple_server import make_server
import time
import cgi
def hello_world(environ, start_response):
    """WSGI application that echoes the ``txt`` query parameter.

    When the request carries a non-empty ``txt`` field in its query string
    the response is ``Echo: <value>``; otherwise the input form is shown.
    Only the query string is consulted, which is what the form (it has no
    ``method`` attribute, so it submits with GET) produces.

    :param environ: the WSGI environment dict.
    :param start_response: the WSGI callable that receives the status line
        and the list of response headers.
    :return: a one-element list holding the UTF-8 encoded HTML page.
    """
    # Imported locally because the surrounding script only imports cgi,
    # a module that was removed from the standard library in Python 3.13.
    from html import escape
    from urllib.parse import parse_qs

    setup_testing_defaults(environ)

    status = '200 OK'
    headers = [('Content-type', 'text/html; charset=utf-8')]
    start_response(status, headers)

    form = parse_qs(environ.get('QUERY_STRING', ''))
    if 'txt' in form:
        # Escape the user-supplied text so it cannot inject markup.
        page = 'Echo: ' + escape(form['txt'][0])
    else:
        page = """
<form>
<input name="txt" />
<input type="submit" value="Echo" />
</form>
"""
    # A WSGI application must return an iterable of bytes; returning a str
    # makes the server iterate over single characters and fail.
    return [page.encode('utf-8')]
httpd = make_server('', 8000, hello_world)
print("Serving on port 8000...")
httpd.serve_forever()
Web form
from wsgiref.util import setup_testing_defaults
from wsgiref.simple_server import make_server
import time
import cgi
def hello_world(environ, start_response):
    """WSGI application that lists the submitted form fields.

    Every ``name=value`` pair found in the query string is rendered as
    ``name==value<br>``. When no field was submitted, the page with the
    sample link and the form is shown instead. Only the query string is
    consulted, which is what the form (it has no ``method`` attribute, so
    it submits with GET) and the sample link produce.

    :param environ: the WSGI environment dict.
    :param start_response: the WSGI callable that receives the status line
        and the list of response headers.
    :return: a one-element list holding the UTF-8 encoded HTML page.
    """
    # Imported locally because the surrounding script only imports cgi,
    # a module that was removed from the standard library in Python 3.13.
    from html import escape
    from urllib.parse import parse_qsl

    setup_testing_defaults(environ)

    status = '200 OK'
    headers = [('Content-type', 'text/html; charset=utf-8')]
    start_response(status, headers)

    fields = parse_qsl(environ.get('QUERY_STRING', ''))
    # Names and values come from the client, so both are escaped.
    html = ''.join(escape(name) + '==' + escape(value) + '<br>'
                   for name, value in fields)
    if not html:
        html = """
<a href="?fname=Foo&lname=Bar">click</a>
<form>
Username: <input name="username" /><br>
Password: <input type="password" name="pw" /><br>
Age group: Under 18 <input type="radio" name="age" value="kid" >
18-30 <input type="radio" name="age" value="young" >
30- <input type="radio" name="age" value="old" >
<input type="submit" value="Send" />
</form>
"""
    # A WSGI application must return an iterable of bytes, not a str.
    return [html.encode('utf-8')]
httpd = make_server('', 8000, hello_world)
print("Serving on port 8000...")
httpd.serve_forever()
Resources
Networking
Secure shell
ssh
- On Windows install putty
# Run "uname -a" on a remote host through the ssh command line client
# and report what it printed.
import subprocess
import sys

if len(sys.argv) != 2:
    sys.exit("Usage: " + sys.argv[0] + " hostname")
host = sys.argv[1]
command = "uname -a"

# run() collects stdout and stderr at the same time. Reading one pipe to
# the end before touching the other can deadlock when the child fills the
# pipe that is not being read.
ssh = subprocess.run(["ssh", host, command],
                     shell=False,
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE,
                     text=True)
result = ssh.stdout.splitlines()
error = ssh.stderr.splitlines()

if error:
    for err in error:
        sys.stderr.write("ERROR: {}\n".format(err))
if result:
    print(result)
ssh from Windows
$ ssh foobar@hostname-or-ip
-o "StrictHostKeyChecking no"
$ plink.exe -ssh foobar@hostname-or-ip -pw "password" -C "uname -a"
$ plink.exe", "-ssh", "foobar@username-or-ip", "-pw", "no secret", "-C", "uname -a"
# Run "uname -a" on a remote host from Windows using PuTTY's plink.exe.
import subprocess
import sys

# NOTE: a password given on the command line is visible to other users in
# the process list; prefer key based authentication outside of a demo.
ssh = subprocess.run([r"c:\Users\foobar\download\plink.exe", "-ssh",
                      "foobar@username-or-ip",
                      "-pw", "password",
                      "-C", "uname -a"],
                     shell=False,
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE,
                     text=True)
# run() drains both pipes together, so a full stderr pipe cannot block the
# child while the parent is still waiting on stdout.
result = ssh.stdout.splitlines()
error = ssh.stderr.splitlines()

if error:
    for err in error:
        sys.stderr.write("ERROR: {}\n".format(err))
if result:
    print(result)
Parallel ssh
- parallel-ssh
- pip install parallel-ssh
from pssh import ParallelSSHClient
hosts = ['myhost1', 'myhost2']
client = ParallelSSHClient(hosts)
output = client.run_command('ls -ltrh /tmp/', sudo=True)
telnet
# Log in to a host over telnet, run two commands and print their output.
# NOTE: telnetlib is deprecated since Python 3.11 and removed in 3.13.
import telnetlib

hostname = '104.131.87.33'
user = 'gabor'
password = 'robag'

# On Python 3 telnetlib works with bytes only: the prompts to wait for and
# the data to send must both be bytes, and what is read back is bytes too.
tn = telnetlib.Telnet(hostname)
tn.read_until(b"login: ")
tn.write(user.encode('ascii') + b"\n")

tn.read_until(b"Password: ")
tn.write(password.encode('ascii') + b"\n")
tn.read_until(b"~$")

tn.write(b"hostname\n")
print(tn.read_until(b"~$").decode('ascii', errors='replace'))
print("-------")

tn.write(b"uptime\n")
print(tn.read_until(b"~$").decode('ascii', errors='replace'))
print("-------")

print("going to exit")
tn.write(b"exit\n")

print("--------")
print(tn.read_all().decode('ascii', errors='replace'))
prompt for password
import getpass
password = getpass.getpass("Password:")
print(password)
ftp
$ sudo aptitude install proftpd
$ sudo /etc/init.d/proftpd start
$ sudo adduser (user: foo pw: bar)
# List the files on an FTP server, upload a file, list again, delete the
# uploaded file and list once more.
from ftplib import FTP

ftp = FTP('localhost')
ftp.login("foo", "bar")

print(ftp.retrlines('LIST'))

print('-------')
for f in ftp.nlst():
    print("file: " + f)

filename = 'ssh.py'

# storlines() reads the file with readline() and sends the result over the
# socket, so the file has to be opened in binary mode. The with block also
# closes it again once the upload is done.
with open(filename, 'rb') as fh:
    ftp.storlines("STOR " + filename, fh)

print('-------')
for f in ftp.nlst():
    print("file: " + f)

ftp.delete(filename)

print('-------')
for f in ftp.nlst():
    print("file: " + f)

# Say goodbye to the server and close the connection.
ftp.quit()
-rw-rw-r-- 1 foo foo 6 Feb 18 19:18 a.txt
-rw-rw-r-- 1 foo foo 6 Feb 18 19:18 b.txt
226 Transfer complete
-------
file: b.txt
file: a.txt
-------
file: b.txt
file: a.txt
file: ssh.py
-------
file: b.txt
file: a.txt
Interactive shell
The Python interactive shell
- len
Type python without any arguments on the command line and
you'll get into the Interactive shell of Python.
In the interactive shell you can type:
>>> print "hello"
hello
>>> "hello"
'hello'
>>> 6
6
>>> len("abc")
3
>>> "abc" + 6
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: cannot concatenate 'str' and 'int' objects
>>> "abc" + str(6)
'abc6'
REPL - Read Evaluate Print Loop
- int
- float
- REPL
A variable comes to existence the first time we assign a value to it. It points to an object and that object knows about its type.
>>> a = "abc"
>>> len(a)
3
>>> a = '3'
>>> a + 3
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: cannot concatenate 'str' and 'int' objects
>>> int(a) + 3
6
>>> a = '2.3'
>>> float(a) + 1
3.3
Using Modules
- import
- sys
- version
- executable
Python has lots of standard (and not standard) modules. You can load one of them using the
import keyword. Once loaded, you can use functions from the module
or access its objects. For example the sys module has a sys.version
and a sys.executable variable.
>>> import sys
>>> sys.version
'2.7.3 (default, Apr 10 2012, 23:24:47) [MSC v.1500 64 bit (AMD64)]'
>>> sys.executable
'c:\\Python27\\python.exe'
You can also load specific object directly into your code.
>>> from sys import executable
>>> executable
'c:\\Python27\\python.exe'
To quit the interpreter call the exit() function.
>>> exit
Use exit() or Ctrl-Z plus Return to exit
The import binds the word sys to whatever it loaded from the file.
Getting help
- help()
- dir()
- import
>>> help
Type help() for interactive help, or help(object) for help about object.
>>> help() - entering an internal shell:
...
help> dir - explains about the dir command. Navigate using SPACE/ENTER/q
help> Ctrl-D                       - to quit (Ctrl-Z ENTER on Windows)
>>> help(dir) - the same explanation as before
>>> dir()
['__builtins__', '__doc__', '__name__', '__package__']
>>> dir("") - list of string related methods
['__add__', '__class__', ... 'upper', 'zfill']
>>> dir(1) - list of integer related methods
['__abs__', '__add__', ... 'numerator', 'real']
>>> dir(__builtins__)
... - functions available in python
>>> help(abs)                      - explain how abs() works
>>> help(sum)
>>> help(zip)
>>> help(int)
>>> help(str)
>>> help("".upper) - explain how the upper method of strings work
>>> import sys
>>> dir(sys)
>>> help(sys)
>>> help(sys)
>>> help(sys.path)
>>> help(sys.path.pop)
Exercise: Interactive shell
- Start the REPL and check the examples.
- Check the documentation in the REPL.
Advanced lists
Change list while looping: endless list
numbers = [1, 1]
for n in numbers:
print(n)
numbers.append(numbers[-1] + numbers[-2])
if n > 100:
break
print(numbers)
Creating a Fibonacci series in a crazy way.
Change list while looping
Probably not a good idea...
numbers = [1, 2, 3, 4]
for n in numbers:
print(n)
if n == 2:
numbers.remove(2)
print(numbers)
1
2
4
[1, 3, 4]
Note, the loop only iterated 3 times, and it skipped value 3
Copy list before iteration
It is better to copy the list using list slices before the iteration starts.
numbers = [1, 2, 3, 4]
for n in numbers[:]:
print(n)
if n == 2:
numbers.remove(2)
print(numbers)
1
2
3
4
[1, 3, 4]
for with flag
names = ['Foo', 'Bar', 'Baz']
ok = False
for i in range(3):
name = input('Your name please: ')
if name in names:
ok = True
break
if not ok:
print("Not OK")
exit()
print("OK....")
for else
The else statement of the for loop is executed when the iteration ends normally. (without calling break)
names = ['Foo', 'Bar', 'Baz']
for i in range(3):
name = input('Your name please: ')
if name in names:
break
else:
print("Not OK")
exit()
print("OK....")
enumerate
- enumerate
names = ['Foo', 'Bar', 'Baz']
for i in range(len(names)):
print(i, names[i])
print('')
for i, n in enumerate(names):
print(i, n)
0 Foo
1 Bar
2 Baz
0 Foo
1 Bar
2 Baz
do while
- do while
There is no do-while in Python, but you can emulate it:
while True:
do_stuff()
if not loop_condition():
break
x = 0
while True:
x += 1
print(x)
if x > 0:
break
list slice is copy
x = [1, 1, 2, 3, 5, 8, 13, 21, 34]
y = x[2:5]
print(y) # [2, 3, 5]
x[2] = 20
print(x) # [1, 1, 20, 3, 5, 8, 13, 21, 34]
print(y) # [2, 3, 5]
Warnings
Warnings
- warn
from warnings import warn
def foo():
    """Old entry point that still works but announces its deprecation.

    Emits a ``DeprecationWarning`` and then prints a confirmation line.
    """
    # stacklevel=2 makes the warning point at the code that called foo(),
    # which is the line that has to be changed, not at this function.
    warn("foo will be deprecated soon. Use bar() instead",
         DeprecationWarning, stacklevel=2)
    print("foo still works")
def main():
foo()
print("afterfoo")
main()
Tox
Tox Examples
def add(x, y):
return x+y
from setuptools import setup
setup(name='mymath',
version='0.2',
description='The best math library',
url='http://github.com/szabgab/mymath',
author='Foo Bar',
author_email='foo@bar.com',
license='MIT',
packages=['mymath'],
zip_safe=False,
requires=[
],
long_description='Long description',
scripts=[],
)
import mymath
def test_add():
assert mymath.add(2, 3) == 5
{% embed include file="src/examples/tox/tox.ini" %}
Selenium
Selenium installation
Get started with Selenium
# Open the URL given on the command line in Chrome, type "selenium" into
# the element with id "search_box" and press Enter.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import chromedriver_autoinstaller
import sys
import re
import time

if len(sys.argv) != 2:
    exit(f"Usage: {sys.argv[0]} URL")
url = sys.argv[1]

chromedriver_autoinstaller.install()

options = webdriver.ChromeOptions()
#options.add_argument('headless')
driver = webdriver.Chrome(options=options)

try:
    driver.get(url)
    driver.fullscreen_window()
    print(driver.title)
    time.sleep(5)

    # The find_element_by_* helpers were removed in Selenium 4;
    # find_element() with a By locator replaces them.
    box = driver.find_element(By.ID, 'search_box')
    box.send_keys("selenium")
    time.sleep(5)
    box.send_keys(Keys.ENTER)  # press enter on the box
    time.sleep(5)

    # element = driver.find_element(By.CLASS_NAME, '')
    # element.is_displayed()
    # print(element.get_attribute('href'))
    # print(element.text)

    # match = re.search(r'Code', driver.page_source)
    # print(match)

    # button = driver.find_element(By.CLASS_NAME, '')
    # button.click()

    #import code
    #code.interact(local=locals())

    #from ptpython.repl import embed
    #embed(globals(), locals())
finally:
    # quit() ends the whole session and the chromedriver process; close()
    # would only close the current window. The finally clause makes sure
    # the browser goes away even when a step above fails.
    driver.quit()
Selenium Headless Screenshot
# Load the URL given on the command line in headless Chrome and save a
# screenshot of the page as screenshot.png.
from selenium import webdriver
import chromedriver_autoinstaller
import sys

if len(sys.argv) != 2:
    exit(f"Usage: {sys.argv[0]} URL")
url = sys.argv[1]

chromedriver_autoinstaller.install()

options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome(options=options)

try:
    driver.get(url)
    print(driver.title)
    driver.get_screenshot_as_file('screenshot.png')
finally:
    # quit() ends the session and the chromedriver process; close() would
    # only close the window and leave the driver running.
    driver.quit()
Playwright
Playwright installation
pip install playwright
playwright install
Playwright demo
from playwright.sync_api import sync_playwright
import sys
if len(sys.argv) != 2:
exit(f"Usage: {sys.argv[0]} URL")
url = sys.argv[1]
with sync_playwright() as play:
for browser_type in [play.chromium]: #, play.firefox, play.webkit]:
browser = browser_type.launch(headless=False)
page = browser.new_page()
page.goto(url)
search_box = page.query_selector("#search_box");
#from ptpython.repl import embed
#embed(globals(), locals())
#page.screenshot(path=f'example-{browser_type.name}.png')
browser.close()
Playwright screenshot
from playwright.sync_api import sync_playwright
with sync_playwright() as play:
for browser_type in [play.chromium]: #, play.firefox, play.webkit]:
browser = browser_type.launch()
page = browser.new_page()
page.goto('http://whatsmyuseragent.org/')
page.screenshot(path=f'example-{browser_type.name}.png')
browser.close()
Advanced functions
Variable scopes
- Local (inside a def)
- Enclosing (in the enclosing def, aka. nonlocal)
- Global (outside of all defs)
Name resolution order (LEGB)
- Local
- Enclosing
- Global
- Built-in
Scoping: global seen from function
a = 42
def f():
print(a)
f()
42
Assignment creates local scope
a = 42
def f():
a = 23
print(a)
print('ok')
print(a)
f()
print(a)
ok
42
23
42
Local scope gone wrong
a = 42
def f():
print(a)
a = 23
print('ok')
print(a)
f()
print(a)
ok
42
Traceback (most recent call last):
File "scoping_external_variable.py", line 8, in <module>
f()
File "scoping_external_variable.py", line 3, in f
print(a)
UnboundLocalError: local variable 'a' referenced before assignment
Accessing a global variable inside a function works, but if I change it (make it refer to another piece of data), then it is disallowed. If I only change the data inside (for mutable variables), that works, but is a bad practice.
Changing global variable from a function
a = 42
def f():
global a
print(a)
a = 23
print(a) # 42
f() # 42
print(a) # 23
Does not need to be created outside
def f():
global a
a = 23
f()
print(a) # 23
Global variables mutable in functions
a = [2]
def f():
print(a) # [2]
a.append(3)
print(a) # [2, 3]
a[0] = 4
f()
print(a) # [4, 3]
Scoping issues
text = ['aaaa', 'bb', 'ccc ccc']
length_1 = [ len(s) for s in text ]
print(length_1) # [4, 2, 7]
length_2 = [ len(s) for x in text ]
print(length_2) # [7, 7, 7]
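In Python 2 list comprehensions did not create their own scope: the loop variable s leaked out of the first comprehension, which is why the second one printed [7, 7, 7]. In Python 3 a comprehension has its own scope, so s does not exist after the first comprehension and the second one raises NameError: name 's' is not defined.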
sub in sub
Functions can be defined inside functions.
def f():
print("in f")
def g():
print("in g")
g()
f()
#g() # does not exist here
They are scoped locally
Scoping sub in sub (enclosing scope)
def external_func():
the_answer = 42
def func(args):
print(args, "the_answer:", the_answer)
# the_answer = 'what was the question?'
# enabling this would give:
# UnboundLocalError: local variable 'the_answer'
# referenced before assignment
func("first")
func("second")
external_func()
{% embed include file="src/examples/advanced-functions/scoping_internal_sub.out" %}
Function objects
The difference between
x = foo
y = foo()
c = 0
def foo():
global c
c += 1
return c
print(foo()) # 1
print(foo()) # 2
x = foo # assigning the function object
y = foo() # assigning the return value of the function
print(foo()) # 4
print(x()) # 5
print(y) # 3
Functions are created at run time
def and class are executed at run-time. Everything is runtime. Even compilation is runtime.
foo() will return a random value every time, but when bar is defined it freezes the specific value that foo returned when bar was created.
import random
def foo():
return random.random()
print(foo())
print(foo())
def bar(a, b = foo()):
return [a, b]
print(bar(1))
print(bar(2))
{% embed include file="src/examples/advanced-functions/runtime-def.out" %}
Mutable default
The default list assigned to b is created when the f function is defined. After that, each call to f() (that does not get a "b" parameter) uses this common list.
def f(a, b = []):
b.append(a)
return b
print(f(1))
print(f(2))
print(f(3))
{% embed include file="src/examples/advanced-functions/mutable_default_parameter.out" %}
Use None instead:
Use None as default parameter
def f(a, b=None):
    """Append ``a`` to ``b`` and return the list.

    :param a: the value to append.
    :param b: the list to append to. When it is left out, a new empty list
        is created on every call, so calls do not share state.
    :return: the list that ``a`` was appended to.
    """
    # Identity test: "== None" would call the __eq__ of whatever was passed
    # in, and an object may claim to be equal to None.
    if b is None:
        b = []
    b.append(a)
    return b
print(f(1))
print(f(2))
print(f(3))
{% embed include file="src/examples/advanced-functions/none_as_default_parameter.out" %}
Inner function created every time the outer function runs
Also defined during run-time, but in every call of bar() the inner_func is redefined again and again.
import random
def foo():
return random.random()
print(foo())
print(foo())
def bar(a, b = foo()):
def inner_func(x, y = foo()):
return [x, y]
print('inner', inner_func(a))
return [a, b]
print(bar(1))
print(bar(2))
{% embed include file="src/examples/advanced-functions/runtime-inner-def.out" %}
Static variable
- static
There are no function-level static variables in Python, but you can fake it quite easily
def counter():
    """Return how many times the function has been called so far.

    The count is kept in the ``cnt`` attribute of the function object
    itself, which serves as a function-level "static" variable. The
    attribute can be read or reset from outside as ``counter.cnt``.
    """
    # getattr() with a default covers the first call, when the attribute
    # does not exist yet.
    counter.cnt = getattr(counter, 'cnt', 0) + 1
    return counter.cnt
print(counter()) # 1
print(counter()) # 2
print(counter()) # 3
print(counter.cnt) # 3
counter.cnt = 6
print(counter()) # 7
Static variable in generated function
def create():
def func():
func.cnt += 1
return func.cnt
func.cnt = 0
return func
a = create()
b = create()
print(a()) # 1
print(a()) # 2
print(b()) # 1
print(a()) # 3
b.cnt = 7
print(a.cnt) # 3
print(b.cnt) # 7
Inspect
The inspect module provides introspection to Python runtime.
inspect.stack returns the stack-trace. Element 0 is the deepest (where we called inspect stack).
Each level has several values. A representation of the frame, filename, linenumber, subroutine-name.
import inspect
import sys
level = int(sys.argv[1])
def f():
print("in f before g")
g()
print("in f after g")
def g():
print("in g")
PrintFrame()
def PrintFrame():
st = inspect.stack()
frame = st[level][0]
info = inspect.getframeinfo(frame)
print('__file__: ', info.filename)
print('__line__: ', info.lineno)
print('__function__: ', info.function)
print('* file', st[level][1])
print('* line', st[level][2])
print('* sub', st[level][3])
f()
python caller.py 1
in f before g
in g
__file__: caller.py
__line__: 15
__function__: g
* file caller.py
* line 15
* sub g
in f after g
Variable number of function arguments
Python function arguments - a reminder
- Order of parameter
- Arguments with default values are optional (and come at the end of the definition)
- Number of arguments is known at the time of function definition. The only flexibility is provided by the optional arguments.
def f(a, b = 42):
print(a)
print(b)
f(23)
# 23
# 42
f(19, 11)
# 19
# 11
f(b=7, a=8)
# 8
# 7
# f() # (runtime) TypeError: f() takes at least 1 argument (0 given)
# f(1, 2, 3) # (runtime) TypeError: f() takes at most 2 arguments (3 given)
# f(b=10, 23) # SyntaxError: non-keyword arg after keyword arg
# def g(a=23, b):
# pass
# SyntaxError: non-default argument follows default argument
Functions with unknown number of arguments
- sum(a, b, c, ...)
- reduce(function, a, b, c, ...)
- report (function, foo = 23, bar = 19, moo = 70, ...)
- report (function, a, b, c, ..., foo = 23, bar = 19, moo = 70, ...)
Variable length argument list with * and **
- *args
- **kwargs
def f(a, b=1, *args, **kwargs):
print('a: ', a)
print('b: ', b)
print('args: ', args)
print('kwargs:', kwargs)
return a + b
f(2, 3, 4, 5, c=6, d=7)
print()
f(2, c=5, d=6)
print()
f(10)
a: 2
b: 3
args: (4, 5)
kwargs: {'c': 6, 'd': 7}
a: 2
b: 1
args: ()
kwargs: {'c': 5, 'd': 6}
a: 10
b: 1
args: ()
kwargs: {}
Passing arguments as they were received (but incorrectly)
What if we need to pass the list of individual arguments (or pairs) to another function?
def f(*args, **kwargs):
print('f args: ', args)
print('f kwargs: ', kwargs)
g(args, kwargs)
def g(*args, **kwargs):
print('g args: ', args)
print('g kwargs: ', kwargs)
f(1, 2, a=3, b=4)
f args: (1, 2)
f kwargs: {'a': 3, 'b': 4}
g args: ((1, 2), {'a': 3, 'b': 4})
g kwargs: {}
g() received 2 individual parameters, the first was a tuple, the second a dictionary
Unpacking args before passing them on
def f(*args, **kwargs):
print('f: ', args)
print('f: ', kwargs)
g(*args, **kwargs)
def g(*args, **kwargs):
print('g: ', args)
print('g: ', kwargs)
f(1, 2, a=3, b=4)
f: (1, 2)
f: {'a': 3, 'b': 4}
g: (1, 2)
g: {'a': 3, 'b': 4}
Exercise: implement the my_sum function
- my_sum should be able to accept any number of values and return their sum.
- my_sum() should return 0 or None. Decide yourself!
- my_sum(2, 3) should return 5. etc.
Solution: implement the my_sum function
def my_sum(*numbers):
    """Return the sum of all the values passed in.

    :param numbers: any number of values that can be added together.
    :return: the sum, or 0 when called without arguments.
    """
    s = 0
    for n in numbers:
        s += n
    return s
print(my_sum()) # 0
print(my_sum(2, 3)) # 5
print(my_sum(-1, 2, -1,)) # 0
Exercise: implement the reduce function
my_reduce(function, a, b, c, ...)
- 'function' is expected to be a function that receives two arguments and returns a result.
- If only the function is given, return None.
- If only one value is given, return that value.
- Take the first two values, run the function on them. Then take the result and the next value and run the function on them. etc. When no more values are left, return the last result.
# print(my_reduce()) # TypeError: my_reduce() takes at least 1 argument (0 given)
print(my_reduce(lambda x,y: x+y)) # None
print(my_reduce(lambda x,y: x+y, 3)) # 3
print(my_reduce(lambda x,y: x+y, -1, 4, -2)) # 1
print(my_reduce(lambda x,y: x*y, -1, 4, -2)) # 8
Solution: implement the reduce function
def my_reduce(f, *args):
    """Fold the given values into one by applying ``f`` from left to right.

    :param f: a function that takes two arguments and returns one result.
    :param args: the values to combine.
    :return: ``None`` when no value was given, the value itself when only
        one was given, otherwise ``f(...f(f(a, b), c)..., last)``.
    """
    if not args:
        return None

    result = args[0]
    # Walk over the remaining values directly instead of indexing by position.
    for value in args[1:]:
        result = f(result, value)
    return result
# print(my_reduce()) # TypeError: my_reduce() takes at least 1 argument (0 given)
print(my_reduce(lambda x,y: x+y)) # None
print(my_reduce(lambda x,y: x+y, 3)) # 3
print(my_reduce(lambda x,y: x+y, -1, 4, -2)) # 1
print(my_reduce(lambda x,y: x*y, -1, 4, -2)) # 8
Exercise: sort pairs
Create a function called sort_pairs, that would receive a sorting method, e.g. the word 'keys' or the word 'values' and will receive an arbitrary number of key-value pairs and will return a list of tuples.
sort_pairs( 'keys', foo = 23, bar = 47)
[('bar', 47), ('foo', 23)]
sort_pairs( 'values', foo = 23, bar = 47)
[('foo', 23), ('bar', 47)]
Solution: sort pairs
def sort_pairs(how, **kwargs):
    """Return the keyword arguments as a sorted list of (key, value) tuples.

    :param how: ``'keys'`` to sort by the names, ``'values'`` to sort by
        the values.
    :param kwargs: the key-value pairs to sort.
    :return: a list of ``(key, value)`` tuples in ascending order.
    :raises ValueError: when ``how`` is neither ``'keys'`` nor ``'values'``.
    """
    if how == 'keys':
        position = 0
    elif how == 'values':
        position = 1
    else:
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working.
        raise ValueError("Invalid sort function")

    return sorted(kwargs.items(), key=lambda pair: pair[position])
k = sort_pairs( 'keys', foo = 23, bar = 47)
print(k)
v = sort_pairs( 'values', foo = 23, bar = 47)
print(v)
Python Packages
Why Create package
As a module gets larger and larger it will be more and more difficult to maintain.
It might be easier if we split it up into multiple files and put those files inside a directory. A 'package' is just that. A bunch of Python modules that belong together and are placed in a directory hierarchy. In order to tell Python that you really mean these files to be a package one must add a file called __init__.py in each directory of the project. In the most simple case the file can be empty.
- Code reuse
- Separation of concerns
- Easier distribution
Create package
- __init__.py
mymath/
__init__.py
calc.py
...
internal_use.py
def add(x, y):
return x+y
# empty
Internal usage
import calc
print(calc.add(7, 8)) # 15
from calc import add
print(add(3, 5)) # 8
cd examples/package
python 1/mymath/internal_use.py
use module in package - relative path
import sys
import os
path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), '1')
# print(path) # /home/gabor/work/slides/python-programming/examples/package/1
sys.path.insert(0, path)
import mymath.calc
print(mymath.calc.add(2, 5))
from mymath.calc import add
print(add(2, 3))
7
5
use package (does not work)
import sys
import os
sys.path.insert(0, os.path.join(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
'1' ) )
import mymath
print(mymath.calc.add(4, 7))
Traceback (most recent call last):
File "use_project/proj1_2.py", line 9, in <module>
print(mymath.calc.add(4, 7))
AttributeError: module 'mymath' has no attribute 'calc'
If we import the main package name, it does not have access to the module inside.
package importing (and exporting) module
- __init__.py
Put import (and thus re-export) in __init__.py
def add(x, y):
return x+y
import mymath.calc
use package (module) with import
Still works...
import sys
import os
path = os.path.join( os.path.dirname(os.path.dirname(os.path.abspath(__file__))), '2' )
# print(path)
sys.path.insert(0, path)
import mymath.calc
print(mymath.calc.add(2, 5)) # 7
from mymath.calc import add
print(add(2, 3)) # 5
use package with import
Now we can import the module from the package and use that.
import sys
import os
sys.path.insert(0, os.path.join(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
'2' ) )
import mymath
print(mymath.calc.add(4, 7)) # 11
from mymath import calc
print(calc.add(5, 9)) # 14
Creating an installable Python package
The directory layout of a package:
├── mymath
│ ├── calc.py
│ └── __init__.py
└── setup.py
from setuptools import setup
setup(name='mymath',
version='0.1',
description='The best math library',
url='http://github.com/szabgab/mymath',
author='Foo Bar',
author_email='foo@bar.com',
license='MIT',
packages=['mymath'],
zip_safe=False,
)
Create tar.gz file
$ python setup.py sdist
- mymath.egg-info/
- dist/mymath-0.1.tar.gz
running sdist
running egg_info
creating mymath.egg-info
writing mymath.egg-info/PKG-INFO
writing top-level names to mymath.egg-info/top_level.txt
writing dependency_links to mymath.egg-info/dependency_links.txt
writing manifest file 'mymath.egg-info/SOURCES.txt'
reading manifest file 'mymath.egg-info/SOURCES.txt'
writing manifest file 'mymath.egg-info/SOURCES.txt'
warning: sdist: standard file not found: should have one of README, README.txt
creating mymath-0.1
creating mymath-0.1/mymath
creating mymath-0.1/mymath.egg-info
making hard links in mymath-0.1...
hard linking setup.py -> mymath-0.1
hard linking mymath/__init__.py -> mymath-0.1/mymath
hard linking mymath.egg-info/PKG-INFO -> mymath-0.1/mymath.egg-info
hard linking mymath.egg-info/SOURCES.txt -> mymath-0.1/mymath.egg-info
hard linking mymath.egg-info/dependency_links.txt -> mymath-0.1/mymath.egg-info
hard linking mymath.egg-info/not-zip-safe -> mymath-0.1/mymath.egg-info
hard linking mymath.egg-info/top_level.txt -> mymath-0.1/mymath.egg-info
Writing mymath-0.1/setup.cfg
creating dist
Creating tar archive
removing 'mymath-0.1' (and everything under it)
Install Package
- pip
- easy_install
$ pip install dist/mymath-0.1.tar.gz
$ easy_install --prefix ~/python/ dist/mymath-0.1.tar.gz
$ python setup.py install --prefix ~/python/
Upload to PyPi or distribute to your users.
Dependencies
requires=[
'lawyerup',
],
To list them
$ python setup.py --requires
In the setup.py file we only need to change the version number and we can release a new version of the package.
Add README file
.
├── bin
│ ├── runmymath.bat
│ └── runmymath.py
├── MANIFEST.in
├── mymath
│ └── test
│ ├── __init__.py
│ ├── test_all.py
│ └── test_calc.py
├── README.rst
└── setup.py
mymath
------
Super awesome Python module to compute the sum of numbers.
To use:
import mymath
mymath.sum(1, 2, 3)
include README.rst
Add README file (setup.py)
In the setup.py add the following function:
def readme():
with open('README.rst') as f:
return f.read()
and in the setup() call include the following parameter:
long_description=readme(),
This will display the README file when called at
$ python setup.py --long-description
Include executables
root/
setup.py
README.rst
MANIFEST.in
bin/
runmymath.py
runmymath.bat
mymath/
__init__.py
calc.py
import mymath
def main():
print("running")
main()
{% embed include file="src/examples/package/3/bin/runmymath.bat" %}
setup.py will need to get
scripts=['bin/runmymath.py', 'bin/runmymath.bat'],
Add tests
- unittest
- discover
root/
setup.py
README.rst
MANIFEST.in
bin/
runmymath.py
runmymath.bat
mymath/
__init__.py
calc.py
test/
__init__.py
test_all.py
test_calc.py
#empty (needed for unittest discover)
python mymath/test/test_calc.py
python mymath/test/test_all.py
python -m unittest discover
Add tests calc
from os.path import dirname,abspath
import sys
sys.path.insert(0, dirname(dirname(dirname(abspath(__file__)))))
from mymath.calc import add
import unittest
class AddTest(unittest.TestCase):
def test_add(self):
self.assertEqual(add(2, 3), 5)
self.assertEqual(add(2, -2), 0)
#self.assertEqual(add(1, 1), 1)
if __name__ == '__main__':
unittest.main()
Add tests all
from os.path import dirname,abspath
import sys
sys.path.insert(0, dirname(dirname(dirname(abspath(__file__)))))
from mymath.calc import *
import unittest
class AllTest(unittest.TestCase):
def test_sum(self):
self.assertEqual(add(2, 3), 5)
#self.assertEqual(sum(1, 1), 2)
#self.assertEqual(div(6, 2), 3)
if __name__ == '__main__':
unittest.main()
setup.py
from setuptools import setup
def readme():
with open('README.rst') as f:
return f.read()
setup(name='mymath',
version='0.2',
description='The best math library',
url='http://github.com/szabgab/mymath',
author='Foo Bar',
author_email='foo@bar.com',
license='MIT',
packages=['mymath'],
zip_safe=False,
requires=[
'lawyerup',
],
long_description=readme(),
scripts=['bin/runmymath.py', 'bin/runmymath.bat'],
)
Run tests and create package
python setup.py test
python setup.py sdist
Exercise: package
-
Go to Pypi, find some interesting module and install it in a non-standard location (or in a virtualenv)
-
Check if it was installed (try to import it in a python script).
-
Take one of the previously created modules, and create a package for it.
-
Install this new package in a non-standard location.
-
Check if it works from some other place in your file-system.
-
Take the mymath package, add another method, add tests and create the distributable zip file.
Exercise: create executable
- Go over some of the examples in the course and package that.
- Package a script using some of your favorite modules.
Distribution of Python code
Distribution demo 1
def whoami():
print(__file__)
if __name__ == "__main__":
whoami()
# setup.py of the demo1 distribution.
# distutils was removed from the standard library in Python 3.12;
# setuptools provides the same setup() entry point.
from setuptools import setup

setup(
    name='demo1',
    version='1.0',
)
- Install from the current folder
pip install .
- Use it on the command line: (try it in a different folder!)
python -m demo1
/home/gabor/venv3/lib/python3.10/site-packages/demo1.py
- Use it in the interactive shell
python
>>> import demo1
>>> demo1.whoami()
- Uninstall (without asking questions)
pip uninstall demo1 --yes
build/
demo1.egg-info/
Distribution demo 2
- The name of the package (demo2a in setup.py) and the name of the module (the filename demo2b.py) don't need to be the same.
- The name of the folder (demo2)
def whoami():
print(__file__)
if __name__ == "__main__":
whoami()
from distutils.core import setup
setup(
name='demo2a',
version='1.0',
)
- Install:
pip install .
- Use the name of the module
python -m demo2b
- Uninstall using the package name
pip uninstall demo2a --yes
Distribution demo 3
One package with multiple python files
Distribute Python application as an exe
Packaging applications (creating executable binaries)
-
py2exe
-
Freeze
-
py2app
-
cx_Freeze
-
PyInstaller
-
py2exe on Windows (discontinued)
-
Freeze on Linux
-
py2app on Mac
-
cx_Freeze cross-platform
-
PyInstaller cross-platform
Using PyInstaller
print("hello world")
pip install pyinstaller
pyinstaller myscript.py
pyinstaller --onefile hello_world.py
- See the results in dist/
Other PyInstaller examples
Use this to see where does the packaged version of our code look for modules:
import sys
print(sys.path)
Use this to see how to pass command line parameters to the packaged exe:
import sys
print(sys.argv)
Other
pyinstaller --onefile --windowed myscript.py
Py2app for Mac
pip install py2app
py2applet examples/other/hello.py
Ctypes
ctypes - hello
- ctypes
#include <stdio.h>
char * echo(char * what)
{
return what;
}
/* Return the sum of a and b. (Defined once: a second identical
 * definition in the same file is a redefinition error.) */
int add_int(int a, int b)
{
    int sum = a+b;
    return sum;
}
int main(void)
{
printf("hello\n");
printf("%d\n", add_int(2, 3));
printf("%s\n", echo("Foo"));
return 0;
}
gcc -o hello hello.c
gcc -o hello.so -shared -fPIC hello.c
from ctypes import cdll
from ctypes import c_char_p

hello_lib = cdll.LoadLibrary("hello.so")
print(hello_lib.add_int(4, 5))  # 9

# A C "char *" parameter needs a bytes object; a str would be passed as wchar_t *.
# Without a declared restype ctypes assumes the function returns an int,
# so the returned pointer shows up as a number.
print(hello_lib.echo(b'Hello World'))  # some integer, e.g. 153977204

hello_lib.echo.restype = c_char_p
print(hello_lib.echo(b'Hello World'))  # b'Hello World'
concat
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
int len(char * s)
{
return strlen(s);
}
/*
 * Return a newly allocated string holding a followed by b.
 * The caller owns the result and must free() it.
 * Returns NULL if the memory cannot be allocated.
 */
char * concat(char * a, char * b)
{
    size_t len_a = strlen(a);
    size_t len_b = strlen(b);
    char * res;

    /* +1 for the terminating NUL byte that ends the combined string */
    res = (char *)malloc(len_a + len_b + 1);
    if (res == NULL)
    {
        return NULL;
    }
    memcpy(res, a, len_a);
    /* copy b together with its terminating NUL */
    memcpy(res + len_a, b, len_b + 1);
    return res;
}
int main(void)
{
printf("concat\n");
printf("%d\n", len("abc"));
printf("%d\n", len(""));
printf("%d\n", len("xxxxxxxxxx"));
printf("%s\n", concat("Foo1", "Bar"));
return 0;
}
from ctypes import cdll
from ctypes import c_char_p

more_lib = cdll.LoadLibrary("more.so")

# The C functions take "char *", so pass bytes; a str would arrive as wchar_t *
# and strlen() would stop at the first zero byte of the wide characters.
print(more_lib.len(b"abcd"))     # 4
print(more_lib.len(b""))         # 0
print(more_lib.len(b"x" * 123))  # 123

more_lib.concat.restype = c_char_p
print(more_lib.concat(b"abc", b"def"))  # b'abcdef'
links
2to3
Converting from Python 2 to Python 3
- 2to3
from future import ...
division
- division
print 3/2 # 1
from __future__ import division
print 3/2 # 1.5
print in Python 2
fname = 'Foo'
lname = 'Bar'
print("Name: %s %s" % (fname, lname))
print("Name: {} {}".format(fname, lname))
print(fname, lname)
print fname, lname
Name: Foo Bar
Name: Foo Bar
('Foo', 'Bar')
Foo Bar
print in Python 3
print now requires print()
from __future__ import print_function
fname = 'Foo'
lname = 'Bar'
print("Name: %s %s" % (fname, lname))
print("Name: {} {}".format(fname, lname))
print(fname, lname)
Name: Foo Bar
Name: Foo Bar
Foo Bar
input and raw_input
- raw_input
- input
raw_input() was renamed to input()
In Python 2 raw_input() returned the raw string. input(), on the other hand ran eval(raw_input())
which meant it tried to execute the input string as a piece of Python code. This was dangerous and was not really used.
In Python 3 raw_input() is gone. input() behaves as the old raw_input() returning the raw string. If you would like to get the old, and dangerous, behavior of input() you can call eval(input()).
Code that works on both 2 and 3
import platform
def my_input(text):
    """Show the prompt ``text`` and return the line the user typed, as a string.

    Uses input() on Python 3 and raw_input() on Python 2, so the result is
    always the raw string and never an evaluated expression.
    """
    # python_version_tuple() returns strings, e.g. ('3', '10', '4'),
    # so the major version must be converted before comparing it with a number.
    if int(platform.python_version_tuple()[0]) >= 3:
        return input(text)
    return raw_input(text)
Compare different types
x = 3
y = '3'
# Python 2 Python 3
print( x > y ) # False TypeError: unorderable types: int() > str()
print( x < y ) # True TypeError: unorderable types: int() < str()
print( x == y ) # False False
Octal numbers
Octal numbers in 2.x were written as 011; in 3.x they are written as 0o11
2to3 Resources
- python3porting book
- wiki
- Dive into Python 3
- The future module
- The third-party future module
- The six module
- docs of 2to3
Design Patterns
What are Design Patterns?
Not all the Design Patterns discussed for Java or C++ are interesting, relevant or even needed in Python. Design Patterns are formal descriptions of how people do things, and not how you should do things. The formal description makes it easy to talk about them.
Some of the DPs exist to overcome problems in that specific language. Other DPs are more general, solving classes of problems that are generic.
Don't replace built-in objects
import sys
print = 'hello'
sys.stdout.write(print)
sys.stdout.write('\n')
pip install flake8-builtins
flake8 --ignore= replace_print.py
replace_print.py:3:1: A001 "print" is a python builtin and is being shadowed, consider renaming the variable
Facade - simple interface to complex system
Facade, a structural design pattern. - Provide a simple interface (maybe a single class with few methods) to some complex system behind it. This gives flexibility for the implementation of the complex system while users gain simplicity in using it in certain subsets of operations.
os.path.basename and os.path.dirname are facades for os.path.split + indexing in the result
os.path.basename = os.path.split()[-1]
os.path.split = split with os.sep
os.path.join(names) = os.sep.join(names)
os.path.isdir(path) = stat.S_ISDIR(os.stat(path))
Monkey Patching
import real_class
class faker(object): pass
fake = faker
real_class.time = fake
fake.sleep =
fake.time =
- handy in emergencies
- easily abused for NON-emergencies - gives dynamic languages a bad name
- subtle hidden "communication" via secret obscure pathways (explicit is better)
class Monkey:
def __init__(self, count):
self.bananas = count
def is_hungry(self):
hungry = True
if hungry:
self.eat()
def eat(self):
self.bananas -= 1
m = Monkey(10)
print(m.bananas) # 10
print(m.is_hungry()) # None
print(m.bananas) # 9
Monkey.eat = lambda self: True
om = Monkey(10)
print(om.bananas) # 10
print(om.is_hungry()) # None
print(om.bananas) # 10
Creation DPs "Just One"
we want just one instance to exist
- Singleton - subclassing can never be really smooth
- Use a module instead of a class (no inheritance, no special methods)
- Make just one instance (self-discipline, no enforcement); you need to decide "when" (in which part of the code) to make it
- monostate (borg)
Singleton
class Singleton(object):
    """Base class whose instantiation always returns one stored instance.

    The instance is created on the first call and kept in the ``_inst``
    class attribute. Because ``hasattr`` also finds an ``_inst`` inherited
    from a parent class, a subclass instantiated after its parent gets the
    parent's instance.
    """

    def __new__(cls, *a, **kw):
        if not hasattr(cls, '_inst'):
            # object.__new__ must be given the class explicitly and accepts
            # no further arguments; *a and **kw are consumed by __init__.
            cls._inst = super(Singleton, cls).__new__(cls)
        return cls._inst
the problem
class Foo(Singleton): pass
class Bar(Foo): pass
f = Foo()
b = Bar()
# what class is b now? is that a Bar or a Foo instance?
Monostate (Borg)
class Monostate(object):
    """Base class whose instances are distinct objects sharing one state.

    Every instance gets the class-level ``_shared_state`` dictionary as its
    ``__dict__``, so an attribute set on one instance is visible on all.
    """

    # The single attribute dictionary used by every instance.
    _shared_state = {}

    def __new__(cls, *a, **kw):
        # object.__new__ needs the class explicitly and takes no other arguments.
        obj = super(Monostate, cls).__new__(cls)
        # _shared_state is a class attribute, so it must be reached through cls.
        obj.__dict__ = cls._shared_state
        return obj


class Foo(Monostate):
    pass


class Bar(Foo):
    pass


f = Foo()
b = Bar()
Better than singleton, data overriding to the rescue: But what if two calls to the constructor provide different initial data?
Dispatch table
calls = []
calls.append( lambda x: x+1 )
calls.append( lambda x: x*2 )
others = [
lambda x: x-1,
lambda x: 0
]
def do_something( call_list ):
for c in call_list:
print(c(3))
do_something( calls )
do_something( others )
Python Pitfalls
Reuse of existing module name
import random
print(random.random())
$ python examples/pitfalls/random.py
Traceback (most recent call last):
File "examples/pitfalls/random.py", line 1, in <module>
import random
File ".../examples/pitfalls/random.py", line 3, in <module>
print(random.random())
TypeError: 'module' object is not callable
- Write an example to use random number and call your example number.py
- Same with any other module name.
- Lack of multi-level namespaces
- Solution: use longer names. Maybe with project specific names.
Use the same name more than once
class Corp(object):
people = []
def add(self, name, salary):
Corp.people.append({ 'name': name, 'salary' : salary})
def total(self):
self.total = 0
for n in Corp.people:
self.total += n['salary']
return self.total
c = Corp()
c.add("Foo", 19)
print(c.total())
c.add("Bar", 23)
print(c.total())
$ python examples/pitfalls/corp.py
19
Traceback (most recent call last):
File "examples/pitfalls/corp.py", line 19, in <module>
print(c.total())
TypeError: 'int' object is not callable
Compare string and number
x = 2
y = "2"
print(x > y)
print(x < y)
Python 2 - compares them based on the type of values (wat?)
$ python examples/pitfalls/compare.py
False
True
Python 3 - throws exception as expected.
$ python3 examples/pitfalls/compare.py
Traceback (most recent call last):
File "examples/pitfalls/compare.py", line 4, in <module>
print(x > y)
TypeError: unorderable types: int() > str()
Compare different types
x = 2
y = "2"
print(x == y)
with open(__file__) as fh:
print(fh == x)
In both Python 2 and Python 3 these return False
import sys
hidden = 42 # would be random
if sys.version_info.major < 3:
guess = raw_input('Your guess: ')
else:
guess = input('Your guess: ')
if hidden == guess:
print("Match!")
Will never match. Even if user types in 42. - Hard to debug and understand as there is no error.
Sort mixed data
from __future__ import print_function
mixed = [10, '1 foo', 42, '4 bar']
print(mixed) # [10, '1 foo', 42, '4 bar']
mixed.sort()
print(mixed) # [10, 42, '1 foo', '4 bar'] in Python 2 (Python 3 raises TypeError in sort())
In Python 2 it "works" in some strange way.
$ python examples/pitfalls/sort.py
[10, '1 foo', 42, '4 bar']
[10, 42, '1 foo', '4 bar']
In Python 3 it correctly throws an exception.
air:python gabor$ python3 examples/pitfalls/sort.py
[10, '1 foo', 42, '4 bar']
Traceback (most recent call last):
File "examples/pitfalls/sort.py", line 5, in <module>
mixed.sort()
TypeError: unorderable types: str() < int()
Linters
Static Code Analysis - Linters
-
lint
-
PEP8
-
Flake8
-
Pylint
PEP8
- pep8
pip install pep8
F811 - redefinition of unused
- flake8
import subprocess
import datetime
import sys
from datetime import datetime
$ flake8 importer.py
importer.py:4:1: F811 redefinition of unused 'datetime' from line 2
Warn when Redefining functions
- pylint
sum = 42
def len(thing):
print(f"Use {thing}.__len__() instead!")
len("abc")
pylint redef.py
************* Module redef
redef.py:1:0: C0111: Missing module docstring (missing-docstring)
redef.py:2:0: W0622: Redefining built-in 'sum' (redefined-builtin)
redef.py:4:0: W0622: Redefining built-in 'len' (redefined-builtin)
redef.py:2:0: C0103: Constant name "sum" doesn't conform to UPPER_CASE naming style (invalid-name)
redef.py:4:0: C0111: Missing function docstring (missing-docstring)
--------------------------------------------------------------------
Your code has been rated at -2.50/10 (previous run: -2.50/10, +0.00)
Signals
Signals and Python
-
kill
-
man 7 signal (on Linux)
-
Unix: kill PID, kill -9 PID, Ctrl-C, Ctrl-Z
-
os.kill
Sending Signal
- kill
import signal
import os
print("before")
os.kill(os.getpid(), signal.SIGUSR1)
print("after")
before
User defined signal 1: 30
Catching Signal
import signal
import os
def handler(signum, frame):
print('Signal handler called with signal', signum)
signal.signal(signal.SIGUSR1, handler)
print("before")
os.kill(os.getpid(), signal.SIGUSR1)
print("after")
before
('Signal handler called with signal', 30)
after
Catching Ctrl-C on Unix
username = input('Username:')
print(username)
$ python ctrl_c.py
{% embed include file="src/examples/signals/ctrl_c.out" %}
import signal
def handler(signum, frame):
print('Signal handler called with signal', signum)
signal.signal(signal.SIGINT, handler)
username = input('Username:')
print(username)
- Cannot stop using Ctrl-C !
- Ctrl-Z and then kill %1
- kill PID
Catching Ctrl-C on Unix confirm
import signal
import time
def handler(signum, frame):
answer = input('We are almost done. Do you really want to exit? [yes]:')
if answer == 'yes':
print('bye')
exit()
print("Then let's keep running")
signal.signal(signal.SIGINT, handler)
for _ in range(10):
time.sleep(5)
Alarm signal and timeouts
import signal
class MyTimeout(Exception):
pass
def handler(signum, frame):
print('Signal handler called with signal', signum)
raise MyTimeout
try:
signal.signal(signal.SIGALRM, handler)
signal.alarm(5)
number = input("Divide by (5 sec):")
signal.alarm(0)
print(42/int(number))
except MyTimeout:
print('timeout')
except Exception as e:
print(e)
#raise
print("Still working")
Exercise: Catching Ctrl-C on Unix 2nd time
- When Ctrl-C is pressed display: "In order to really kill the application press Ctrl-C again" and keep running. If the user presses Ctrl-C again, then let it die.
- Improve the previous solution so that if there is no 2nd Ctrl-C within 5 seconds of the first one, then any further Ctrl-C will trigger the above message again.
Exercise: Signals
- What signal is sent when you run kill PID?
- Write a script that will disable the kill PID for your process. How can you kill it then?
- What signal is sent when we press Ctrl-Z ?
Ctrl-z
import signal
import os
print(os.getpid())
username = input('Username:')
print(username)
kill PID
import signal
import os
print(os.getpid())
def handler(signum, frame):
print('Signal handler called with signal', signum)
signal.signal(signal.SIGTERM, handler)
username = input('Username:')
print(username)
Data Science
Data Science Resources
-
Machine Learning with Andrew Ng.
-
Data Scientist with Python on DataCamp.
-
[Stanford cs231n](http://cs231n.stanford.edu/)
-
Pandas profiling
FAQ
How not to name example scripts?
Don't - by mistake - call one of your files the same as a module you will be loading.
For example random.py is a bad idea if you will import random.
Your code will try to locate random.py to load, but will find itself and not the one that comes with Python.
Python will also create a random.pyc file - a compiled file - and it will take time till you recall this and delete that too. Till then the whole thing will seem to be broken.
Platform independent code
In general Python is platform independent, but still needs some care to make sure you don't step on some aspects of Operating System or the file system that works differently on other OS-es.
- Filenames are case sensitive on some OS-es (e.g. Linux) but not on others (e.g. Windows). They used to be restricted to 8.3. Make sure you are within the restriction of every OS you might want to use.
- Directory path: (slash or backslash or something else?) use the os.path methods.
- os.path.expanduser('~') works on both Linux and Windows, but the root of a Linux/Unix file system starts with a slash (/) and on Windows it is c:\ and d:\ etc.
- On Linux/Unix you have user 'root' and on Windows 'Administrator'
- File permissions are different on Linux and Windows.
- Stay away from OS specific calls, but as a last resort use os.name or sys.platform to figure out which os is this. os.name is 'posix' on Linux and 'nt' on Windows.
- For GUI use wxWindows that has a native look on Windows and Gnome look on Linux.
- Pay attention to any 32/64 bit issues. Big/Little Endian issues.
- Some modules might be OS specific. Check the documentation.
- Pay attention to the use of os.system and the subprocess module.
How to profile a python code to find causes of slowness?
Use one of these modules:
- cProfile is in C. It is faster and preferable.
- profile
pdb = Python Debugger
- pdb
Include the following code in your script at any point, and run the script as you'd do normally. It will stop at the given point and enter the debugger.
import pdb; pdb.set_trace()
Avoid Redefining functions
Can I tell python to stop compilation when someone is redefining a function? Or at least give me a warning?
Use pylint for that
Algorithm
Exercise: Find the odd value
-
Given a list of values, we know that every value comes in pairs except one. Find where it is:
-
f(["a", "a", "b", "b", "c", "d", "d"]) would return 4
Solution: Find the odd value
def find_odd(values):
    '''
    Return the index of the single unpaired value in a list of adjacent pairs.

    Every value is expected to appear twice in a row, except one value that
    appears only once. The position is found with a binary search over the
    even indexes: while a pair starting at an even index is intact, the
    single value must be to the right of it.

    Raises ValueError if the number of elements is even.

    >>> find_odd(['c'])
    0
    >>> find_odd(['c', 'x', 'x'])
    0
    >>> find_odd(['x', 'x', 'c'])
    2
    >>> find_odd(['x', 'x', 'c', 'y', 'y'])
    2
    >>> find_odd(['a', 'a', 'b', 'b', 'd', 'd', 'x', 'x', 'c', 'y', 'y'])
    8
    >>> find_odd(['a', 'a', 'c', 'b', 'b', 'd', 'd', 'x', 'x', 'y', 'y'])
    2
    '''
    if len(values) % 2 == 0:
        raise ValueError("Number of elements must not be divisible by 2")

    start = 0
    end = len(values) - 1
    # start and end are always even; the single value is in values[start:end+1]
    while end - start >= 2:
        middle = start + (end - start) // 2
        middle -= middle % 2  # align to the first element of a pair
        if values[middle] == values[middle + 1]:
            # intact pair: the single value is further to the right
            start = middle + 2
        else:
            # broken pair: the single value is here or to the left
            end = middle
    return start
# To verify run
# pytest --doctest-modules find_the_odd_value.py
Exercise: Generalized find the odd value
-
Given a list of values, we know that every value comes in groups of N except one group that has less than N element. Given the list and the number N find where it starts:
-
f(["a", "a", "a", "b", "b", "b", "x", "d", "d", "d"], 3) would return 6
-
f(["a", "a", "a", "b", "b", "b", "x", "y", "d", "d", "d"], 3) would return 6
Solution: Generalized Find the odd value
def find_odd(values, size=2):
    '''
    Return the start index of the one incomplete group in a list of groups.

    The values are expected to come in runs of ``size`` equal elements,
    except for one run that has fewer than ``size`` elements. Only group
    boundaries (multiples of ``size``) are inspected: while the group at
    such a boundary is complete, the short group must be to the right.

    Raises ValueError if ``size`` is smaller than 2 or if the number of
    elements is divisible by ``size``.

    >>> find_odd(['c'])
    0
    >>> find_odd(['c', 'x', 'x'])
    0
    >>> find_odd(['x', 'x', 'c'])
    2
    >>> find_odd(['x', 'x', 'c', 'y', 'y'])
    2
    >>> find_odd(['a', 'a', 'b', 'b', 'd', 'd', 'x', 'x', 'c', 'y', 'y'])
    8
    >>> find_odd(['a', 'a', 'c', 'b', 'b', 'd', 'd', 'x', 'x', 'y', 'y'])
    2

    >>> find_odd(['c'], 3)
    0
    >>> find_odd(['c', 'd'], 3)
    0
    >>> find_odd(['c', 'x', 'x', 'x'], 3)
    0
    >>> find_odd(['c', 'd', 'x', 'x', 'x'], 3)
    0
    >>> find_odd(['x', 'x', 'x', 'c', 'd'], 3)
    3
    >>> find_odd(['x', 'x', 'x', 'c', 'd', 'y', 'y', 'y'], 3)
    3
    >>> find_odd(['a', 'a', 'a', 'b', 'b', 'b', 'd', 'd', 'd', 'x', 'x', 'x', 'c', 'y', 'y', 'y'], 3)
    12
    >>> find_odd(['a', 'a', 'a', 'b', 'b', 'b', 'd', 'd', 'd', 'x', 'x', 'x', 'c', 'q', 'y', 'y', 'y'], 3)
    12
    >>> find_odd(['a', 'a', 'a', 'c', 'b', 'b', 'b', 'd', 'd', 'd', 'x', 'x', 'x', 'y', 'y', 'y'], 3)
    3
    >>> find_odd(['a', 'a', 'a', 'b', 'b', 'b', 'c', 'z', 'd', 'd', 'd', 'x', 'x', 'x', 'y', 'y', 'y'], 3)
    6
    '''
    if size < 2:
        raise ValueError("size must be at least 2")
    if len(values) % size == 0:
        raise ValueError(f"Number of elements must not be divisible by {size}")

    start = 0
    end = len(values) - 1
    # start is always a multiple of size; the short group begins in values[start:end+1]
    while end - start >= size:
        middle = start + (end - start) // size
        middle -= middle % size  # align to the first element of a group
        if all(values[middle] == val for val in values[middle + 1:middle + size]):
            # complete group: the short group is further to the right
            start = middle + size
        else:
            # broken group: the short group starts here or to the left
            end = middle
    return start
# To verify run
# pytest --doctest-modules generalized_find_the_odd_value.py
Exercise: Shortest sublist with sum over limit
-
Given a list of integers [10, 12, 35, 7] and a number e.g. 25 return the length of the shortest sublist where the sum of the numbers is greater than or equal to the given number. If no such sublist can be found return -1.
-
A few examples:
>>> shortest([], 7)
-1
>>> shortest([2, 3], 7)
-1
>>> shortest([2, 3], 0)
0
>>> shortest([], 0)
0
>>> shortest([7, 3], 7)
1
>>> shortest([4, 7, 3], 7)
1
>>> shortest([1, 23, 1, 1, 10, 11, 12], 30)
3
>>> shortest([1, 23, 1, 1, 10, 11, 12], 24)
2
>>> shortest([1, 10, 11, 40], 30)
1
Solution: Shortest sublist with sum over limit
def shortest(numbers, limit):
    '''
    Return the length of the shortest contiguous sublist whose sum is >= limit.

    Returns 0 when limit is 0 or negative (the empty sublist already
    qualifies) and -1 when no sublist reaches the limit. A sliding window is
    used: the window grows to the right until its sum reaches the limit and
    is then shrunk from the left as long as it still does.
    This approach presumes the numbers are not negative.

    >>> shortest([], 7)
    -1
    >>> shortest([2, 3], 7)
    -1
    >>> shortest([2, 3], 0)
    0
    >>> shortest([], 0)
    0
    >>> shortest([7, 3], 7)
    1
    >>> shortest([4, 7, 3], 7)
    1
    >>> shortest([1, 23, 1, 1, 10, 11, 12], 30)
    3
    >>> shortest([1, 23, 1, 1, 10, 11, 12], 24)
    2
    >>> shortest([1, 10, 11, 40], 30)
    1
    >>> shortest([], -1)
    0
    '''
    if limit <= 0:
        return 0

    length = None
    start = 0
    total = 0
    for end, number in enumerate(numbers):
        total += number
        # shrink from the left while the window still reaches the limit
        while start <= end and total >= limit:
            width = end - start + 1
            if width == 1:
                # a single element is the shortest possible answer
                return 1
            if length is None or width < length:
                length = width
            total -= numbers[start]
            start += 1
    return -1 if length is None else length
# To verify run
# pytest --doctest-modules shortest_sublist.py
Refactor
Refactoring example - change variable name
data = ['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn']
for i in data:
print(i)
celestical_objects = ['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn']
for planet in celestical_objects:
print(planet)
How to Refactor
- Write tests that will verify the behaviour (or at least compare the new behavior to the old behavior)
- Make a small change.
- Run the tests.
Exercise: Fix deep indentation
import re
import sys
print("Welcome to D3L3.1415 TELEphone InDX. Please wait while we fetch all phones in the document")
if len(sys.argv) != 2:
print("Invalid argument number. D3L3.1415 rules are for your own good, please try again")
exit()
phone = []
print('the TELEphone numbers are: ')
path = sys.argv[1]
with open(path, 'r') as fh:
for line in fh:
match = re.search(r' .+-.+', line)
if match:
splinter = match.group(0).split()
for check in splinter:
b = list(check)
a = b[len(b) - 1]
if (ord(a) >= 48) and (ord(a) <= 57):
phone.append(check)
for tele in phone:
print(tele)
import sys
import os
def getinput():
string = []
if len(sys.argv)<2:
exit(f'USE: {sys.argv[0]} FILE'.center(40,' -'))
elif os.path.isfile(sys.argv[1]):
file = sys.argv[1]
else:
file = 0
if file:
with open(file) as f:
for line in f:
string.append(line.strip('\n'))
try:
string = ''.join(string)
except Exception:
exit('EXCEPTION IN INPUT'.center(40,' -'))
else:
string = sys.argv[1]
return(string)
#def funique(items):
# unique = []
#
# filt = [' ', '.', ',', ':']
# items = list(filter(lambda x : x not in filt ,items))
#
# for item in items:
# if item.split()[0] not in unique:
# unique.append(item.split()[0])
# return(unique)
# def count(unique,items):
#
# count = [0]*len(unique)
# print(count)
# diction = {unique[i]:count[i] for i in range(len(unique))}
# print(type(diction))
# for u in unique:
# print('from count unique: ',u)
# for item in items:
# print('from count item: ',item)
# if u == item:
# print(True)
# print(type(u))
# print(diction['A'],diction['T'])
# diction[u][0] += 1
# print(id(diction['A']),id(diction['T']))
# print(diction)
# return
# return(diction)
# In[162]:
def count(items):
    """Return a dict mapping each element of items to its number of occurrences.

    Spaces, dots, commas and colons are skipped. Keys appear in the order
    in which the elements are first seen.
    """
    filt = [' ', '.', ',', ':']
    diction = {}
    for item in items:
        if item in filt:
            continue
        # get() supplies 0 for an element seen for the first time
        diction[item] = diction.get(item, 0) + 1
    return diction
def out(diction):
    """Print one line per key, in sorted key order: key, count and percentage.

    diction maps each key to its count; the percentage is the count's share
    of the sum of all counts. Nothing is printed for an empty dict.
    """
    summ = sum(diction.values())
    # use the parameter, not a module-level variable, so any dict can be printed
    for key in sorted(diction):
        percent = (diction[key] / summ) * 100
        print(f'{key:<2}{diction[key]:<3}-{percent:>6.2f} %')
items = getinput()
d = count(items)
out(d)
Overview of Python syntax
Scalars
- Numbers (int, float)
- Strings (str)
- Boolean
Numbers
a = 23
b = 2.3
c = a + b
d = a - b
e = a * b
f = a / b
g = a // b # int(a/b)
m = a % 7 # modulo
x = a ** 2 # exponent
Strings
a = "double quote"
b = 'single quote'
c = """
multi
line
can use either single or double quotes
"""
d = f"f-string with {a} or with {b}"
e = r"\raw\string\to\keep\backslashes"
x = a + b
var[3]
var[3:7]
len(var)
ord(char)
chr(number)
var.title()
var.upper()
var.lower()
var.index(sub)
var.rindex(sub)
var.find(sub)
short in long
if short in long:
print('in')
':'.join(list_of_strings)
some_string.split(':')
int - float - string conversion
int()
float()
str()
Booleans
True
False
Lists
fruits = ['apple', 'banana', 'peach', 'pear']
fruits[2]
fruits[1:3]
fruits[::2]
fruits[:]
len(fruits)
import copy
copy.copy(fruits) # shallow copy
copy.deepcopy(fruits)
element in some_list
if element in some_list:
print('in')
fruits.index(sub) # return location or raises Exception
fruits.insert(location, anothe_fruit)
fruits.append(another_fruit)
fruits.remove(some_fruit) # remove by value
fruits.pop(location) # remove by location
list()
fruits.sort()
sorted(fruits)
Queue and Stack
Stack:
append
pop
Queue:
append
pop(0)
from collections import deque
Stack
fruits.append(...)
fruits.pop()
Queue
fruits = deque()
fruits.append(...)
fruits.popleft()
Dictionaries
Tuples
"immutable list"
tuple()
fruits = ('apple', 'banana', 'peach')
Sets
set()
set(some_list)
fruits = {'apple', 'banana', 'peach'}
I/O
print(var)
print(var, end=" ", sep="")
STDIN - Standard input
input("Some question: ")
CLI
sys.argv
argparse
Control flow
- Loops
- Conditionals
- Boolean operators
- Conditional (ternary) operator
- Exceptions
While - Loops
while cond:
pass
break
continue
For - Loops
for var in some_string:
print(var)
for var in range(3, 15, 2):
print(var)
for var in some_list:
print(var)
for var in some_iterable:
print(var)
for var in some_iterable:
print(var)
else:
print("finished iterating")
Conditionals
# An elif branch is only reached when every condition before it was false,
# so it must test a different condition than the if.
if cond1:
    pass
elif cond2:
    pass
else:
    pass
Comparison operators
==
!=
<
<=
>=
>
Boolean operators
and
or
not
The conditional (ternary) operator
result = this if condition else that
Random Values
import random
random.seed(42)           # fixed seed: the same "random" values on every run
random.random()           # float in the range [0.0, 1.0)
random.randrange(1, 7)    # integer from 1 up to (but not including) 7
random.choice(values)     # one element of a non-empty sequence
random.sample(values, k)  # k distinct elements; the size k is required
Math
import math
math.pi
math.sin()
Exceptions
raise Exception("Some text")
raise ValueError("Some text")
try:
# Risky code
except Exception as err:
# Handle exception
Files
(Plain text, CSV, Excel, JSON, YAML)
Functions
Modules
bytes
Exception handling
Flake8 Pylint assert
Serialization (Marshalling)
Why Serialization is needed?
- Data transfer between processes on the same computer
- Network traffic
- Storing data for later reuse in another process
Questions to ask
- Which programming languages support it besides Python?
- Can the files be accessed on other operating systems, other architectures, different versions of Python?
- How long does it take to store additional entry?
- How long does it take to access an entry?
- How much memory is needed? Do we need to read the whole file or can we read records?
- How much disk-space is being used for the serialized data?
Various tools for serialization
- Plain text
- CSV
- JSON
- YAML
- XML
- Matlab format savemat loadmat
- pickle (python only)
- marshal (internal usage)
- Protobuf
- HDF5 in python: h5py
- parquet in python: parquet
Serialization with h5py
- HDF5 - Hierarchical Data Format - supports n-dimensional datasets and each element in the dataset may itself be a complex object.
- docs
TODO: fix these
import h5py
import os
import sys
import numpy as np
filename = 'counter.h5'
if len(sys.argv) == 1:
if not os.path.exists(filename):
print("counter does not exist yet")
exit(1)
with h5py.File(filename, 'r') as hdf:
for name in hdf.keys():
print(f"{name}: {hdf[name][0]}")
exit()
if not os.path.exists(filename):
with h5py.File(filename, 'w') as hdf:
pass
with h5py.File(filename, 'r+') as hdf:
for name in sys.argv[1:]:
if name not in hdf:
hdf[name] = np.zeros(1, dtype=int)
hdf[name][0] += 1
print(f"{name}: {hdf[name][0]}")
import h5py
#import numpy as np
#
#original_data = []
#
#count = 10
#size = (2, 5)
filename = 'data.h5'
#
#for _ in range(count):
# row = np.random.random(size)
# print(row)
# original_data.append(row)
with h5py.File(filename, 'w') as hdf:
hdf["a"] = 23
hdf["b"] = 19
with h5py.File(filename, 'r') as hdf:
print(hdf) # <HDF5 file "data.h5" (mode r)>
print(hdf.keys()) # <KeysViewHDF5 ['a', 'b']>
for key in hdf.keys():
print(key, hdf[key])
import sys
import h5py
filename = sys.argv[1]
with h5py.File(filename, 'r') as hdf:
loaded = hdf['data'][:]
print(len(loaded))
print(type(loaded))
print(loaded.size)
print(loaded.shape)
print(type(loaded[0]))
print(loaded[0].size)
print(loaded[0].shape)
Serialization of single Numpy array
pip install numpy
pip install scipy
pip install h5py
pip install protobuf
import os
import sys
import json
import numpy as np
import h5py
import scipy.io
import pickle
def main():
size = (2, 4)
if len(sys.argv) == 3:
size = (int(sys.argv[1]), int(sys.argv[2]))
print(f"size: {size}\n")
original = np.random.random(size)
#print(original)
try_json(original)
try_pickle(original)
try_matlab(original)
try_hdf5(original)
def try_json(original):
with open('demo.json', 'w') as fh:
json.dump(original, fh, default=lambda obj: obj.tolist())
with open('demo.json') as fh:
loaded = np.array(json.load(fh)) #, default=lambda obj: obj.tolist())
#print(loaded)
assert np.array_equal(original, loaded)
print(f"json: {os.path.getsize('demo.json'):7}")
def try_pickle(original):
with open('demo.pickle', 'wb') as fh:
pickle.dump(original, fh, pickle.HIGHEST_PROTOCOL)
with open('demo.pickle', 'rb') as fh:
loaded = pickle.load(fh)
assert np.array_equal(original, loaded)
print(f"pickle: {os.path.getsize('demo.pickle'):7}")
def try_matlab(original):
scipy.io.savemat('demo.mat', {'data': original})
mat = scipy.io.loadmat('demo.mat')
loaded = mat['data']
assert np.array_equal(original, loaded)
print(f"matlab: {os.path.getsize('demo.mat'):7}")
def try_hdf5(original):
with h5py.File('demo.h5', 'w') as hdf:
hdf['data'] = original
with h5py.File('demo.h5', 'r') as hdf:
loaded = hdf['data'][:] # [:] is needed to copy the content
assert np.array_equal(original, loaded)
print(f"hdf5: {os.path.getsize('demo.h5'):7}")
main()
- Try to gzip the JSON file and maybe also the others and see the sizes.
Serialization of multiple Numpy arrays
- hdf5 allows you to access specific array without loading the whole data structure into memory.
- Same with SQLite, but it is much bigger.
import os
import sys
import glob
import json
import sqlite3
import numpy as np
import h5py
import scipy.io
import pickle
def main():
for path in glob.glob("demo*"):
os.unlink(path)
if len(sys.argv) != 4:
exit(f"Usage: {sys.argv[0]} ROWS, COLS, COUNT")
size = (int(sys.argv[1]), int(sys.argv[2]))
count = int(sys.argv[3])
print(f"size: {size} count {count}\n")
originals = [np.random.random(size) for _ in range(count)]
#print(originals)
try_json(originals)
try_pickle(originals)
try_matlab(originals)
try_hdf5(originals)
try_hdf5_separate(originals)
try_sqlite(originals)
def try_json(originals):
with open('demo.json', 'w') as fh:
json.dump(originals, fh, default=lambda obj: obj.tolist())
with open('demo.json') as fh:
loaded = np.array(json.load(fh)) #, default=lambda obj: obj.tolist())
#print(loaded)
assert np.array_equal(originals, loaded)
print(f"json: {os.path.getsize('demo.json'):7}")
def try_pickle(originals):
with open('demo.pickle', 'wb') as fh:
pickle.dump(originals, fh, pickle.HIGHEST_PROTOCOL)
with open('demo.pickle', 'rb') as fh:
loaded = pickle.load(fh)
assert np.array_equal(originals, loaded)
print(f"pickle: {os.path.getsize('demo.pickle'):7}")
def try_matlab(originals):
scipy.io.savemat('demo.mat', {'data': originals})
mat = scipy.io.loadmat('demo.mat')
loaded = mat['data']
assert np.array_equal(originals, loaded)
print(f"matlab: {os.path.getsize('demo.mat'):7}")
def try_hdf5(originals):
with h5py.File('demo.h5', 'w') as hdf:
hdf['data'] = originals
with h5py.File('demo.h5', 'r') as hdf:
loaded = hdf['data'][:] # [:] is needed to copy the content
assert np.array_equal(originals, loaded)
#print(loaded)
print(f"hdf5: {os.path.getsize('demo.h5'):7}")
# Don't load all the data in memory when reading
def try_hdf5_separate(originals):
with h5py.File('demo.hdf5', 'w') as hdf:
hdf['data'] = originals
for ix in range(len(originals)):
with h5py.File('demo.hdf5', 'r') as hdf:
loaded = hdf['data'][ix][:] # [:] is needed to copy the content
#print(loaded)
assert np.array_equal(originals[ix], loaded)
print(f"hdf5: {os.path.getsize('demo.hdf5'):7}")
# Don't load all the data in memory when reading
def try_sqlite(originals):
    """Store each array as a pickled BLOB row in demo.db and read them back one by one.

    Creates the ``arrays`` table in ``demo.db`` in the current directory,
    inserts one row per array, then fetches every row by id and checks that
    it equals the original. Finally prints the size of the database file.
    A failure to create or fill the table is reported and the read-back is
    still attempted; a failure while reading is reported and ends the program.
    """
    conn = sqlite3.connect("demo.db")
    try:
        curs = conn.cursor()
        curs.execute('''CREATE TABLE arrays (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            array BLOB NOT NULL
        )''')
        sql = '''INSERT INTO arrays (array) VALUES (?)'''
        pickled = [pickle.dumps(arr, pickle.HIGHEST_PROTOCOL) for arr in originals]
        # executemany needs one tuple per row for the placeholder
        curs.executemany(sql, [(arr,) for arr in pickled])
        conn.commit()
    except sqlite3.OperationalError as err:
        print(f'sqlite error: {err.args[0]}')
    finally:
        conn.close()

    # Read the rows back one at a time over a single connection that is
    # always closed, instead of opening (and leaking) one per array.
    conn = None
    try:
        conn = sqlite3.connect("demo.db")
        curs = conn.cursor()
        sql = '''SELECT array FROM arrays WHERE id == ?'''
        # AUTOINCREMENT ids start at 1 in a freshly created table
        for ix, original in enumerate(originals, start=1):
            curs.execute(sql, (ix,))
            # NOTE: pickle.loads is only safe because this function wrote the data itself
            loaded = pickle.loads(curs.fetchone()[0])
            assert np.array_equal(original, loaded)
    except sqlite3.OperationalError as err:
        print(f'sqlite error: {err.args[0]}')
        exit()
    finally:
        if conn is not None:
            conn.close()

    print(f"sqlite: {os.path.getsize('demo.db'):7}")
main()
Other slides
Other slides
Some slides that used to be part of the material and they might return to be there, but for now they were parked here.
Atom for Python
Some details about the Atom editor. You can freely skip this part. Personally I don't use it now.
Autocomplete
- apm install autocomplete-python
Autocomplete
- easy_install jedi
- apm install autocomplete-plus-python-jedi
Linter
- easy_install flake8
- easy_install flake8-docstrings
- apm install linter
- apm install linter-flake8
IDLE - Integrated DeveLopment Environment
-
IDLE
-
Python shell
-
Better editing
-
Limited debugger
-
c:\Python27\Lib\idlelib\idle.bat -
C:\Users\Gabor\AppData\Local\Programs\Python\Python35\Lib\idlelib\idle.bat
sh-bang - executable on Linux/Apple
#!/usr/bin/env python
print("Hello World")
- The first line starting with # is needed if you want to have a file that can be executed without explicitly typing in python as well.
- Make your file executable: chmod u+x hello_ex.py
- Run like: ./hello_ex.py
- In order to run it as hello_ex.py it needs to be located in one of the directories listed in the PATH environment variable.
pydoc
If you really want it, you can also read some of the documentation on the command line, but unless you are locked up some place without Internet connection, I don't recommend this.
Type pydoc. On Windows, you might need to create the following file and put it in a directory in your PATH. (see echo %PATH%)
@python c:\Python27\Lib\pydoc.py %*
Spyder Intro
- iPython console (bottom right)
- Spyder-Py2 / Preferences / Console / Advanced Settings
- Save the file (Ctrl-S / Command-S)
- Run/Run (F5)
- F9 - execute selected text (e.g. we can execute a function definition after we've changed it)
- TAB to autocomplete names of already existing variables.
print("abc")
"abc". shows the available methods.
"abc".center Command-I will explain what is "center"
Interactive Debugging
def f(a, b):
    """Return ``(a + b) + (a * b)``."""
    total = a + b
    product = a * b
    return total + product
def run():
print(f(2, 3))
import code
code.interact(local=locals())
print(f(19, 23))
run()
Parameter passing
def hello(name):
msg = name + '!!!!'
print('Hello ' + msg)
hello('Foo')
hello('Bar')
Hello Foo!!!!
Command line arguments and main
import sys
def hello(name):
msg = name + '!!!!'
print('Hello ' + msg)
def main():
hello(sys.argv[1])
main()
Run as python argv.py Foo
Later we'll see the argparse module that can handle command line arguments in a better way.
Name of the current function in Python
- inspect
- currentframe
- stack
import inspect
def first():
print(inspect.currentframe().f_code.co_name)
print(inspect.stack()[0][3])
second()
def second():
print(inspect.currentframe().f_code.co_name)
print(inspect.stack()[0][3])
def main():
first()
main()
Name of the caller function in Python
- inspect
- stack
import inspect
def first():
print("in first")
print("Called by", inspect.stack()[1][3])
second()
def second():
print("in second")
print("Called by", inspect.stack()[1][3])
def main():
first()
main()
Stack trace in Python using inspect
- inspect
- stack
import inspect
def first():
second()
def second():
for info in inspect.stack():
#print(info)
#FrameInfo(
# frame=<frame at 0x1c18b18, file 'stack_trace.py', line 9, code second>,
# filename='stack_trace.py',
# lineno=8,
# function='second',
# code_context=[' for level in inspect.stack():\n'],
# index=0)
#print(info.frame)
print(info.filename)
print(info.lineno)
print(info.function)
print(info.code_context)
print('')
def main():
first()
if __name__ == '__main__':
main()
stack_trace.py
8
second
[' for info in inspect.stack():\n']
stack_trace.py
4
first
[' second()\n']
stack_trace.py
26
main
[' first()\n']
stack_trace.py
30
<module>
[' main()\n']
Getting the class name of an object
- class
- name
- type
How to find out which class an object (instance) belongs to?
import re
a = 2
b = "3"
c = 2.3
m = re.search(r'\d', str(c))
print(a.__class__) # <type 'int'>
print(b.__class__) # <type 'str'>
print(c.__class__) # <type 'float'>
print(type(a)) # <type 'int'>
print(type(b)) # <type 'str'>
print(type(c)) # <type 'float'>
print(a.__class__.__name__) # int
print(b.__class__.__name__) # str
print(c.__class__.__name__) # float
print(re.__class__.__name__) # module
print(m.__class__.__name__) # SRE_Match or Match
Circular references
Circular references are cleaned up by the garbage collector, but maybe not all the memory is given back to the OS, and it can take some time to clean them up.
import time
def create_pair():
a = {'name' : 'Foo'}
b = {'name' : 'Bar'}
a['pair'] = b
b['pair'] = a
#print(a)
for i in range(1, 30000000):
create_pair()
print("let's sleep now a bit")
time.sleep(20)
but weakref might expedite the cleanup. See also the gc module and if I can show it http://stackoverflow.com/questions/2428301/should-i-worry-about-circular-references-in-python
Context managers: with (file) experiments
with open('out.txt', 'w') as h:
h.write("hello\n")
h = open('out.txt')
print(h.read())
f = open('out.txt', 'w')
f.write("hello\n")
f.close()
# for line in open("myfile.txt"):
# print line,
# the file is closed only when script ends
range vs xrange in Python
- range
- xrange
from __future__ import print_function
import sys
r = range(1000)
x = xrange(1000)
for v in r: # 0..999
pass
for v in x: # 0..999
pass
print(sys.getsizeof(r)) # 8072
print(sys.getsizeof(x)) # 40
In Python 2 range creates a list of values range(from, to, step) and xrange creates an iterator.
In Python 3 range creates the iterator and if really necessary then list(range()) can create the list.
profile (with hotshot) slow code
It was experimental and dropped from Python 3
import slow
import os
import hotshot, hotshot.stats
prof = hotshot.Profile("slow.prof")
prof.runcall(slow.main, 1000)
prof.close()
stats = hotshot.stats.load("slow.prof")
stats.strip_dirs()
stats.sort_stats('time', 'calls')
stats.print_stats(20)
os.remove("slow.prof")
501501 function calls in 0.337 seconds
Ordered by: internal time, call count
ncalls tottime percall cumtime percall filename:lineno(function)
498501 0.192 0.000 0.192 0.000 slow.py:37(swap)
1 0.136 0.136 0.335 0.335 slow.py:21(sort)
999 0.006 0.000 0.006 0.000 slow.py:4(f)
999 0.002 0.000 0.002 0.000 random.py:173(randrange)
1 0.001 0.001 0.003 0.003 slow.py:31(get_str)
999 0.000 0.000 0.000 0.000 slow.py:10(g)
1 0.000 0.000 0.337 0.337 slow.py:14(main)
0 0.000 0.000 profile:0(profiler)
Python Descriptors
- init
- get
- set
- delete
A more manual way to implement the property() functionality we have just seen. Use cases:
-
Implement type-checking and/or value checking for attribute setters ()
Python from .NET
TODO and add to dotnet
TODO: example with async call in .NET getting back to python
Matplotlib subplot
- Generates a separate graph, but when saving to disk, the image is blank
fig, ax = plt.subplots()
ax.plot(
[ 1, 2, 3, 4 ],
[ 10, 3, 45, 5 ],
)
Jupyter StackOverflow - histogram
# Histogram of the top 20 countries
first20.hist(bins = 20)
# Plot using Seaborn
plot = sns.relplot(data = first20)
plot.set_xticklabels(rotation=90)
Jupyter StackOverflow - OpenSourcer
df['OpenSourcer'].value_counts()
df['OpenSourcer'].unique()
Jupyter StackOverflow - cross tabulation
# Crosstabulation
first10 = country_count.head(10)
subset = df[ df['Country'].isin( first10.keys() ) ]
# subset.count()
# subset['OpenSourcer'].value_counts()
grouped = subset.groupby('Country')['OpenSourcer'].value_counts()
# grouped.plot.bar(figsize=(15,15))
pd.crosstab(subset['Country'], df['OpenSourcer'])
ct = pd.crosstab(subset['Country'], df['OpenSourcer']).apply(lambda r: 100 * r/r.sum(), axis=1)
ct
ct.transpose().hist(figsize=(15, 15))
Jupyter StackOverflow - salaries
# Try to show the average salary by country
grp = df.groupby('Country').mean().round({'CompTotal' : 0})
#grp['CompTotal']
pd.set_option('display.float_format', lambda x: '{:,}'.format(x))
grp.sort_values('CompTotal', ascending=False)
Jupyter StackOverflow - replace values
nd = df.replace({'OpenSourcer' : {
'Never' : 0,
'Less than once per year' : 1,
'Less than once a month but more than once per year' : 2,
'Once a month or more often' : 3,
} })
nd
nd.describe()
nd.groupby('Country').mean().sort_values('OpenSourcer', ascending=False)
NameError
python common_error.py 42
import sys
if len(sys.argv) != 2:
exit(f"Usage: {sys.argv[0]} Number")
if 42 < int(sys.argv[1]):
res = "bigger"
elif int(sys.argv[1]) < 42:
res = "smaller"
print(res)
# NameError: name 'res' is not defined
UnboundLocalError
python common_error_in_function.py 42
import sys
def check():
if len(sys.argv) != 2:
exit(f"Usage: {sys.argv[0]} Number")
if 42 < int(sys.argv[1]):
res = "bigger"
elif int(sys.argv[1]) < 42:
res = "smaller"
print(res)
check()
# UnboundLocalError: local variable 'res' referenced before assignment
Insert element in sorted list using bisect
- bisect
import bisect
solar_system = ['Earth', 'Jupiter', 'Mercury', 'Saturn', 'Venus']
name = 'Mars'
# Find the location where to insert the element to keep the list sorted
loc = bisect.bisect(solar_system, name)
print(loc)
solar_system.insert(loc, name)
print(solar_system)
print(sorted(solar_system))
Gravatar in Python
import hashlib
import sys
def gravatar(email):
    """Return the hexadecimal MD5 digest of ``email`` after stripping
    surrounding whitespace, lower-casing it and encoding it as UTF-8."""
    normalized = email.strip().lower()
    digest = hashlib.md5(normalized.encode('utf8'))
    return digest.hexdigest()
if len(sys.argv) != 2:
exit(f"Usage: {sys.argv[0]} EMAIL")
email = sys.argv[1]
code = gravatar(email)
print(f"https://www.gravatar.com/avatar/{code}?s=100&d=blank")
Debug with ptpython
pip install ptpython
- Then either use it as a REPL to explore code or make your application fall back into this REPL to debug your code.
import requests
from ptpython.repl import embed
res = requests.get("https://code-maven.com/")
embed(globals(), locals())
print("done")
REPL - Interactive debugging with ptpython
from ptpython.repl import embed
x = 32
embed(globals(), locals())
y = 42
print('OK')
Print in color on the CLI
- colorama
from colorama import Fore, Back, Style
print('default color text')
print(Fore.RED + 'red text' + Style.RESET_ALL)
print(Back.GREEN + 'black with green background' + Style.RESET_ALL)
print(Fore.YELLOW + Back.BLACK + 'yellow text with black background' + Style.RESET_ALL)
print('default color text')
print(Fore.RED)
print('red text')
print(Back.BLACK)
print('red text black background')
print(Style.RESET_ALL)
print('back to default color')
Easy Install
$ easy_install module_name
- Installing pip on Windows as well:
easy_install pip
Will work on Windows as well.
easy_install -d ~/python Genshi
sorting with sorted using a key
To sort the list according to length using sorted
animals = ['snail', 'cow', 'elephant', 'chicken']
animals_in_abc = sorted(animals)
print(animals)
print(animals_in_abc)
animals_by_length = sorted(animals, key=len)
print(animals_by_length)
['snail', 'cow', 'elephant', 'chicken']
['chicken', 'cow', 'elephant', 'snail']
['cow', 'snail', 'chicken', 'elephant']
get and set locale
- locale
- LC_CTYPE
- getlocale
- setlocale
import locale
print(locale.getlocale(locale.LC_CTYPE))
locale.setlocale(locale.LC_CTYPE, 'en_US.UTF-8')
print(locale.getlocale(locale.LC_CTYPE))
locale.setlocale(locale.LC_CTYPE, 'en_IL.UTF-8')
print(locale.getlocale(locale.LC_CTYPE))
##locale.setlocale(locale.LC_CTYPE, 'ZH.UTF-8')
#print(locale.getlocale(locale.LC_CTYPE))
Modify time anomaly
Without calling flush the modify-time of the two files will be the same. Even if we sleep 0.001 seconds. Despite the fact that the filesystem provides more accurate values.
If we wait a bit between calls, or if we flush the buffer of the file, then the timestamps will be different.
import os
import time
with open("first.txt", "w") as fh:
fh.flush()
pass
print(f"time: {time.time()}")
#time.sleep(0.01)
with open("second.txt", "w") as fh:
pass
first = os.path.getmtime("first.txt")
second = os.path.getmtime("second.txt")
print(first)
print(second)
print("same" if first == second else "diff")
Some strange code
folder = "animals/"
image = "cat"
fname = f'{folder}images/{image}.jpg',
config = {
'file_name': fname[0],
'title': "Animals",
}
print(config)
is vs ==
a = 1
b = a
c = 1
print(a == b) # True
print(a == c) # True
print(a is b) # True
print(a is c) # True
a = {"name": "Foo"}
b = a
c = {"name": "Foo"}
print(a == b) # True
print(a == c) # True
print(a is b) # True
print(a is c) # False
print_function
from __future__ import print_function
print(23)
Dividers (no break or continue)
We will see how break and continue work, but first let's see a loop to find all the divisors of a number n.
# Print every i with 2 <= i < n that divides n, using a plain while loop.
i = 2
n = 3*5*7
while i < n:
    # Test the remainder: in Python 3 `/` is float division, so
    # (n / i) * i == n can be true even when i does not divide n (e.g. i = 2).
    if n % i == 0:
        print('{:2} divides {}'.format(i, n))
    i = i + 1
3 divides 105
5 divides 105
7 divides 105
15 divides 105
21 divides 105
35 divides 105
Remove file
- os.remove
- os.unlink
Modules: more
-
sys.modules
-
imp.reload
-
reload
-
sys.modules to list loaded modules
-
imp.reload to reload module (Just reload before 3.3)
import __builtin__
def xx(name):
print("hello")
__builtin__.__import__ = xx;
print('body')
def f():
print("in f")
import sys
print('mod' in sys.modules) # False
import mod
print('mod' in sys.modules) # True
print(sys.modules['mod'])
# <module 'mod' from '/stuff/python/examples/modules/mod.py'>
print(sys.modules["sys"]) # <module 'sys' (built-in)>
import hooks
- import
Python resources
- Central Python site
- Python documentation
- Learning Python the Hard way
- Python Weekly
- PyCoder's Weekly
Progress bar
# http://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console
import time, sys
for i in range(10):
sys.stdout.write('\r' + '=' * i)
sys.stdout.flush()
time.sleep(1)
from __future__
from __future__ import print_function
from __future__ import division
or
from __future__ import print_function, division
See also future
We cannot import everything that is in future, because we don't know what will be in future in the future.... and we don't want to blindly change the behaviour of Python.
Variable scope
-
scope
-
There are two scopes: outside of all functions and inside of a function.
-
The first assignment to a variable defines it.
-
Variables that were declared outside all functions can be seen inside, but cannot be changed.
-
One can connect the outside name to an inside name using the 'global' keyword.
-
if and for blocks don't provide scoping.
a = 23
def main():
global b
b = 17
c = 42
print('a:', a) # a: 23
print('b:', b) # b: 17
print('c:', c) # c: 42
if True:
print('a:', a) # a: 23
print('b:', b) # b: 17
b = 99
print('b:', b) # b: 99
print('c:', c) # c: 42
print('a:', a) # a: 23
print('b:', b) # b: 99
print('c:', c) # c: 42
main()
print('a:', a) # a: 23
print('b:', b) # b: 99
print('c:', c) # c:
# Traceback (most recent call last):
# File "examples\basics\scope.py", line 27, in <module>
# print 'c:', c # c:
# NameError: name 'c' is not defined
global scope
scope
# x is global
x = 1
print(x, "- before sub")
def f():
#print(x, "- inside before declaration") # UnboundLocalError
x = 2
print(x, "- inside sub")
print(x, "- after sub declaration")
f()
print(x, "- after calling sub")
# 1 - before sub
# 1 - after sub declaration
# 2 - inside sub
# 1 - after calling sub
# x is global
def f():
#print(x, "- inside before declaration") # UnboundLocalError
x = 2
print(x, "- inside sub")
x = 1
print(x, "- before calling sub")
print(x, "- after sub declaration")
f()
print(x, "- after calling sub")
# 1 - before calling sub
# 1 - after sub declaration
# 2 - inside sub
# 1 - after calling sub
If we declare a variable outside of all the subroutines, it does not matter if we do it before the sub declaration, or after it. In neither case has the global variable any presence inside the sub.
def f():
x = 2
print(x, "- inside sub")
# print(x, " - after sub declaration") # NameError
f()
# print(x, " - after calling sub") # NameError
# 2 - inside sub
A name declared inside a subroutine is not visible outside.
def f():
global x
# print(x) # NameError
x = 2
print(x, "- inside sub")
# print(x, " - after sub declaration") # NameError
f()
print(x, "- after calling sub")
# 2 - inside sub
# 2 - after calling sub
Unless it was marked using the global word.
type
- type
- name
x = 2
y = '2'
z = [2, '2']
d = {}
def f():
pass
l = lambda q: q
class Cold():
pass
cold = Cold()
class Cnew(object):
pass
cnew = Cnew()
# r = xrange(10) # Python 3 does not have xrange
print(type(x)) # <type 'int'>
print(type(y)) # <type 'str'>
print(type(z)) # <type 'list'>
print(type(d)) # <type 'dict'>
print(type(f)) # <type 'function'>
print(type(l)) # <type 'function'>
print(type(Cold)) # <type 'classobj'>
print(type(cold)) # <type 'instance'>
print(type(Cnew)) # <type 'type'>
print(type(cnew)) # <class '__main__.Cnew'>
#print(type(r)) # <type 'xrange'>
print(type(x).__name__) # int
print(type(y).__name__) # str
print(type(z).__name__) # list
Look deeper in a list
x = ['abcd', 'efgh']
print(x) # ['abcd', 'efgh']
print(x[0:1]) # ['abcd']
print(x[0]) # 'abcd'
print(x[0][0]) # a
print(x[0][1]) # b
print(x[0][0:2]) # ab
More examples
import random
class Game:
def __init__(self):
self.lower_limit = 0
self.upper_limit = 200
self.number = random.randrange(self.lower_limit, self.upper_limit)
self.is_debug = False
self.running = True
def debug(self):
self.is_debug = not self.is_debug
def guess(self, num):
if num == 'd':
self.debug()
return
if self.is_debug:
print("Hidden number {}. Your guess is {}".format(self.number, num))
if num < self.number:
print("Too small")
elif num > self.number:
print("Too big")
else:
print("Bingo")
self.running = False
g = Game()
g.guess('d')
try:
g.guess('z')
except Exception as e:
print(e)
try:
g.guess('201')
except Exception as e:
print(e)
try:
g.guess('-1')
except Exception as e:
print(e)
Hidden number 137. Your guess is z
Not a Number z
Hidden number 137. Your guess is 201
Number 201 is too big
Hidden number 137. Your guess is -1
Number -1 is too small
import random
class SpaceShipError(Exception):
def __init__(self, inp):
self.inp = inp
class NumberTooBigError(SpaceShipError):
def __str__(self):
return "Number {} is too big".format(self.inp)
class NumberTooSmallError(SpaceShipError):
def __str__(self):
return "Number {} is too small".format(self.inp)
class NotANumberError(SpaceShipError):
def __str__(self):
return "Not a Number {}".format(self.inp)
class Game:
    """Number guessing game.

    The hidden number is drawn with ``random.randrange(lower_limit,
    upper_limit)``. ``running`` stays True until the number is guessed.
    """

    def __init__(self):
        self.lower_limit = 0
        self.upper_limit = 200
        self.number = random.randrange(self.lower_limit, self.upper_limit)
        self.is_debug = False
        self.running = True

    def debug(self):
        """Toggle debug mode."""
        self.is_debug = not self.is_debug

    def guess(self, num):
        """Process one guess.

        ``'d'`` toggles debug mode. Any other value is converted with
        ``int()`` and compared with the hidden number; the verdict is printed
        and ``running`` is set to False on a correct guess.

        Raises:
            NotANumberError: ``num`` cannot be converted to an integer.
            NumberTooBigError: the guess is above ``upper_limit``.
            NumberTooSmallError: the guess is below ``lower_limit``.
        """
        if num == 'd':
            self.debug()
            return

        if self.is_debug:
            print("Hidden number {}. Your guess is {}".format(self.number, num))

        try:
            num = int(num)
        except (TypeError, ValueError) as err:
            raise NotANumberError(num) from err

        if num > self.upper_limit:
            raise NumberTooBigError(num)

        # Compare with the lower bound; checking against upper_limit here
        # rejected every guess below the upper limit as "too small".
        if num < self.lower_limit:
            raise NumberTooSmallError(num)

        if num < self.number:
            print("Too small")
        elif num > self.number:
            print("Too big")
        else:
            print("Bingo")
            self.running = False
g = Game()
g.guess('d')
try:
g.guess('z')
except Exception as e:
print(e)
try:
g.guess('201')
except Exception as e:
print(e)
try:
g.guess('-1')
except Exception as e:
print(e)
#while g.running:
# guess = input("Please type in your guess: ")
# g.guess(int(guess))
This will run if there was no exception at all
Always executes. 6/2 ended.
Always executes. 6/0 ended.
Always executes. 6/a ended.
Traceback (most recent call last):
File "try.py", line 22, in <module>
main()
File "try.py", line 9, in main
divide(cnt, num)
File "try.py", line 3, in divide
return x/y
TypeError: unsupported operand type(s) for /: 'int' and 'str'
def divide(x, y):
return x/y
def main():
cnt = 6
for num in [2, 0, 'a']:
try:
divide(cnt, num)
except ZeroDivisionError:
pass
except (IOError, MemoryError) as err:
print(err)
else:
print("This will run if there was no exception at all")
finally:
print("Always executes. {}/{} ended.".format(cnt, num))
print("done")
main()
1
2
Fizz
4
Buzz
Fizz
7
8
Fizz
Buzz
11
Fizz
13
14
FizzBuzz
16
17
Fizz
19
Buzz
Fizz
22
23
Fizz
Buzz
26
Fizz
28
29
FizzBuzz
31
32
Fizz
34
Buzz
Fizz
37
38
Fizz
Buzz
41
Fizz
43
44
FizzBuzz
46
47
Fizz
49
Buzz
Fizz
52
53
Fizz
Buzz
56
Fizz
58
59
FizzBuzz
61
62
Fizz
64
Buzz
Fizz
67
68
Fizz
Buzz
71
Fizz
73
74
FizzBuzz
76
77
Fizz
79
Buzz
Fizz
82
83
Fizz
Buzz
86
Fizz
88
89
FizzBuzz
91
92
Fizz
94
Buzz
Fizz
97
98
Fizz
Buzz
def fizzbuzz():
    """Print the numbers 1 to 100, one per line.

    Multiples of 3 are replaced by "Fizz", multiples of 5 by "Buzz" and
    multiples of both by "FizzBuzz".
    """
    for number in range(1, 101):
        by_three = number % 3 == 0
        by_five = number % 5 == 0
        if by_three and by_five:
            print("FizzBuzz")
        elif by_three:
            print("Fizz")
        elif by_five:
            print("Buzz")
        else:
            print(number)
if __name__ == "__main__":
fizzbuzz()
import fb
def test_fb(capsys):
fb.fizzbuzz()
out, err = capsys.readouterr()
assert err == ''
with open('expected.txt') as fh:
expected = fh.read()
assert out == expected
import sys
import os
import time
if len(sys.argv) != 3:
exit(f"Usage: {sys.argv[0]} FILENAME count")
filename, count = sys.argv[1:]
print(f"start {os.getpid()}")
time.sleep(1)
for _ in range(int(count)):
try:
if not os.path.exists(filename):
with open(filename, 'w') as fh:
fh.write("0\n")
with open(filename, 'r') as fh:
number = int(fh.readline())
number += 1
with open(filename, 'w') as fh:
#fh.seek(0,0)
fh.write(f"{number}\n")
except Exception:
pass
print(f"done {os.getpid()}")
import sys
import os
import time
if len(sys.argv) != 3:
exit(f"Usage: {sys.argv[0]} FILENAME count")
filename, count = sys.argv[1:]
print(f"start {os.getpid()}")
time.sleep(1)
for _ in range(int(count)):
#try:
if not os.path.exists(filename):
with open(filename, 'w') as fh:
fh.write("0\n")
with open(filename, 'r+') as fh:
number = int(fh.readline())
number += 1
fh.seek(0,0)
fh.write(f"{number}\n")
# with open(filename, 'w') as fh:
# fh.write(f"{number}\n")
#except Exception:
# pass
print(f"done {os.getpid()}")
import subprocess
import sys
if len(sys.argv) != 4:
exit(f"Usage: {sys.argv[0]} FILENAME count processes")
filename, count, process_count = sys.argv[1:]
command = [sys.executable, 'count.py', filename, count]
processes = []
for _ in range(int(process_count)):
processes.append(subprocess.Popen(command))
print('Started')
for proc in processes:
proc.communicate()
print('Done')
name,birthdate,weight,height
Alice Archer,1997-01-10,57.9,1.56
Ben Brown,1985-02-15,72.5,1.77
Chloe Cooper,1983-03-22,53.6,1.65
Daniel Donovan,1981-04-30,83.1,1.75
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "bc5909c3-0e34-46b7-af44-d8b59bbd1817",
"metadata": {},
"outputs": [],
"source": [
"!pip install polars"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0422d14b-b3d5-407b-8594-31633a056594",
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"import datetime as dt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ef584243-839b-47a3-a34a-52b5d8d5d4c2",
"metadata": {},
"outputs": [],
"source": [
"df = pl.DataFrame(\n",
" {\n",
" \"name\": [\"Alice Archer\", \"Ben Brown\", \"Chloe Cooper\", \"Daniel Donovan\"],\n",
" \"birthdate\": [\n",
" dt.date(1997, 1, 10),\n",
" dt.date(1985, 2, 15),\n",
" dt.date(1983, 3, 22),\n",
" dt.date(1981, 4, 30),\n",
" ],\n",
" \"weight\": [57.9, 72.5, 53.6, 83.1], # (kg)\n",
" \"height\": [1.56, 1.77, 1.65, 1.75], # (m)\n",
" }\n",
")\n",
"\n",
"print(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2943f6c6-2989-4b66-ac3f-37f218d578bb",
"metadata": {},
"outputs": [],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7904a22a-aced-4618-951c-80afaeaf7ba5",
"metadata": {},
"outputs": [],
"source": [
"dir(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6cb1363a-0706-4fb8-9ce1-284c2ce14720",
"metadata": {},
"outputs": [],
"source": [
"df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "832f56a2-a543-4db5-ab41-be2c8cfa989a",
"metadata": {},
"outputs": [],
"source": [
"df.tail(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a0ffa702-8f7e-488d-9175-82a2ae9c8738",
"metadata": {},
"outputs": [],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "362dc935-407a-44be-bbc9-1a7111b852ba",
"metadata": {},
"outputs": [],
"source": [
"df.write_csv(\"getting_started.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7bfde105-95dc-4b3f-9763-1f2e132b9b0e",
"metadata": {},
"outputs": [],
"source": [
"df_csv = pl.read_csv(\"getting_started.csv\", try_parse_dates=True)\n",
"print(df_csv)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70a7e566-33e1-471d-a71d-f7e645787355",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "58e1ecba-46f7-47f6-ad33-f708f07ee28d",
"metadata": {},
"outputs": [],
"source": [
"result = df.select(\n",
" pl.col(\"name\"),\n",
" (pl.col(\"weight\", \"height\") * 0.95).round(2).name.suffix(\"-5%\"),\n",
")\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "39d7eecf-dea3-409d-b904-5b260de8ca7d",
"metadata": {},
"outputs": [],
"source": [
"result = df.with_columns(\n",
" birth_year=pl.col(\"birthdate\").dt.year(),\n",
" bmi=pl.col(\"weight\") / (pl.col(\"height\") ** 2),\n",
")\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbfce594-67a9-47d4-bdc1-5488f532163d",
"metadata": {},
"outputs": [],
"source": [
"result = df.filter(pl.col(\"birthdate\").dt.year() < 1990)\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1870d98c-d320-4fd3-8f03-4f43c8958a7c",
"metadata": {},
"outputs": [],
"source": [
"result = df.filter(\n",
" pl.col(\"birthdate\").is_between(dt.date(1982, 12, 31), dt.date(1996, 1, 1)),\n",
" pl.col(\"height\") < 1.7,\n",
")\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac8411e7-5f8a-46ad-aca4-83892af2b80b",
"metadata": {},
"outputs": [],
"source": [
"result = df.group_by(\n",
" (pl.col(\"birthdate\").dt.year() // 10 * 10).alias(\"decade\"),\n",
" maintain_order=True,\n",
").len()\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "be3c0d55-c16c-4cda-8d04-c0fb56007833",
"metadata": {},
"outputs": [],
"source": [
"result = df.group_by(\n",
" (pl.col(\"birthdate\").dt.year() // 10 * 10).alias(\"decade\"),\n",
" maintain_order=True,\n",
").agg(\n",
" pl.len().alias(\"sample_size\"),\n",
" pl.col(\"weight\").mean().round(2).alias(\"avg_weight\"),\n",
" pl.col(\"height\").max().alias(\"tallest\"),\n",
")\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29c60262-5dbf-49d6-b27f-fd81e11068fb",
"metadata": {},
"outputs": [],
"source": [
"result = (\n",
" df.with_columns(\n",
" (pl.col(\"birthdate\").dt.year() // 10 * 10).alias(\"decade\"),\n",
" pl.col(\"name\").str.split(by=\" \").list.first(),\n",
" )\n",
" .select(\n",
" pl.all().exclude(\"birthdate\"),\n",
" )\n",
" .group_by(\n",
" pl.col(\"decade\"),\n",
" maintain_order=True,\n",
" )\n",
" .agg(\n",
" pl.col(\"name\"),\n",
" pl.col(\"weight\", \"height\").mean().round(2).name.prefix(\"avg_\"),\n",
" )\n",
")\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fe9d8375-97a1-40f1-8c17-7b4e2994059e",
"metadata": {},
"outputs": [],
"source": [
"df2 = pl.DataFrame(\n",
" {\n",
" \"name\": [\"Ben Brown\", \"Daniel Donovan\", \"Alice Archer\", \"Chloe Cooper\"],\n",
" \"parent\": [True, False, False, False],\n",
" \"siblings\": [1, 2, 3, 4],\n",
" }\n",
")\n",
"print(df2)\n",
"print(df.join(df2, on=\"name\", how=\"left\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "477e99a9-a13a-457c-bcb2-2f45154a86f7",
"metadata": {},
"outputs": [],
"source": [
"df3 = pl.DataFrame(\n",
" {\n",
" \"name\": [\"Ethan Edwards\", \"Fiona Foster\", \"Grace Gibson\", \"Henry Harris\"],\n",
" \"birthdate\": [\n",
" dt.date(1977, 5, 10),\n",
" dt.date(1975, 6, 23),\n",
" dt.date(1973, 7, 22),\n",
" dt.date(1971, 8, 3),\n",
" ],\n",
" \"weight\": [67.9, 72.5, 57.6, 93.1], # (kg)\n",
" \"height\": [1.76, 1.6, 1.66, 1.8], # (m)\n",
" }\n",
")\n",
"print(df3)\n",
"print(pl.concat([df, df3], how=\"vertical\"))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa
6,5.4,3.9,1.7,0.4,Iris-setosa
7,4.6,3.4,1.4,0.3,Iris-setosa
8,5.0,3.4,1.5,0.2,Iris-setosa
9,4.4,2.9,1.4,0.2,Iris-setosa
10,4.9,3.1,1.5,0.1,Iris-setosa
11,5.4,3.7,1.5,0.2,Iris-setosa
12,4.8,3.4,1.6,0.2,Iris-setosa
13,4.8,3.0,1.4,0.1,Iris-setosa
14,4.3,3.0,1.1,0.1,Iris-setosa
15,5.8,4.0,1.2,0.2,Iris-setosa
16,5.7,4.4,1.5,0.4,Iris-setosa
17,5.4,3.9,1.3,0.4,Iris-setosa
18,5.1,3.5,1.4,0.3,Iris-setosa
19,5.7,3.8,1.7,0.3,Iris-setosa
20,5.1,3.8,1.5,0.3,Iris-setosa
21,5.4,3.4,1.7,0.2,Iris-setosa
22,5.1,3.7,1.5,0.4,Iris-setosa
23,4.6,3.6,1.0,0.2,Iris-setosa
24,5.1,3.3,1.7,0.5,Iris-setosa
25,4.8,3.4,1.9,0.2,Iris-setosa
26,5.0,3.0,1.6,0.2,Iris-setosa
27,5.0,3.4,1.6,0.4,Iris-setosa
28,5.2,3.5,1.5,0.2,Iris-setosa
29,5.2,3.4,1.4,0.2,Iris-setosa
30,4.7,3.2,1.6,0.2,Iris-setosa
31,4.8,3.1,1.6,0.2,Iris-setosa
32,5.4,3.4,1.5,0.4,Iris-setosa
33,5.2,4.1,1.5,0.1,Iris-setosa
34,5.5,4.2,1.4,0.2,Iris-setosa
35,4.9,3.1,1.5,0.1,Iris-setosa
36,5.0,3.2,1.2,0.2,Iris-setosa
37,5.5,3.5,1.3,0.2,Iris-setosa
38,4.9,3.1,1.5,0.1,Iris-setosa
39,4.4,3.0,1.3,0.2,Iris-setosa
40,5.1,3.4,1.5,0.2,Iris-setosa
41,5.0,3.5,1.3,0.3,Iris-setosa
42,4.5,2.3,1.3,0.3,Iris-setosa
43,4.4,3.2,1.3,0.2,Iris-setosa
44,5.0,3.5,1.6,0.6,Iris-setosa
45,5.1,3.8,1.9,0.4,Iris-setosa
46,4.8,3.0,1.4,0.3,Iris-setosa
47,5.1,3.8,1.6,0.2,Iris-setosa
48,4.6,3.2,1.4,0.2,Iris-setosa
49,5.3,3.7,1.5,0.2,Iris-setosa
50,5.0,3.3,1.4,0.2,Iris-setosa
51,7.0,3.2,4.7,1.4,Iris-versicolor
52,6.4,3.2,4.5,1.5,Iris-versicolor
53,6.9,3.1,4.9,1.5,Iris-versicolor
54,5.5,2.3,4.0,1.3,Iris-versicolor
55,6.5,2.8,4.6,1.5,Iris-versicolor
56,5.7,2.8,4.5,1.3,Iris-versicolor
57,6.3,3.3,4.7,1.6,Iris-versicolor
58,4.9,2.4,3.3,1.0,Iris-versicolor
59,6.6,2.9,4.6,1.3,Iris-versicolor
60,5.2,2.7,3.9,1.4,Iris-versicolor
61,5.0,2.0,3.5,1.0,Iris-versicolor
62,5.9,3.0,4.2,1.5,Iris-versicolor
63,6.0,2.2,4.0,1.0,Iris-versicolor
64,6.1,2.9,4.7,1.4,Iris-versicolor
65,5.6,2.9,3.6,1.3,Iris-versicolor
66,6.7,3.1,4.4,1.4,Iris-versicolor
67,5.6,3.0,4.5,1.5,Iris-versicolor
68,5.8,2.7,4.1,1.0,Iris-versicolor
69,6.2,2.2,4.5,1.5,Iris-versicolor
70,5.6,2.5,3.9,1.1,Iris-versicolor
71,5.9,3.2,4.8,1.8,Iris-versicolor
72,6.1,2.8,4.0,1.3,Iris-versicolor
73,6.3,2.5,4.9,1.5,Iris-versicolor
74,6.1,2.8,4.7,1.2,Iris-versicolor
75,6.4,2.9,4.3,1.3,Iris-versicolor
76,6.6,3.0,4.4,1.4,Iris-versicolor
77,6.8,2.8,4.8,1.4,Iris-versicolor
78,6.7,3.0,5.0,1.7,Iris-versicolor
79,6.0,2.9,4.5,1.5,Iris-versicolor
80,5.7,2.6,3.5,1.0,Iris-versicolor
81,5.5,2.4,3.8,1.1,Iris-versicolor
82,5.5,2.4,3.7,1.0,Iris-versicolor
83,5.8,2.7,3.9,1.2,Iris-versicolor
84,6.0,2.7,5.1,1.6,Iris-versicolor
85,5.4,3.0,4.5,1.5,Iris-versicolor
86,6.0,3.4,4.5,1.6,Iris-versicolor
87,6.7,3.1,4.7,1.5,Iris-versicolor
88,6.3,2.3,4.4,1.3,Iris-versicolor
89,5.6,3.0,4.1,1.3,Iris-versicolor
90,5.5,2.5,4.0,1.3,Iris-versicolor
91,5.5,2.6,4.4,1.2,Iris-versicolor
92,6.1,3.0,4.6,1.4,Iris-versicolor
93,5.8,2.6,4.0,1.2,Iris-versicolor
94,5.0,2.3,3.3,1.0,Iris-versicolor
95,5.6,2.7,4.2,1.3,Iris-versicolor
96,5.7,3.0,4.2,1.2,Iris-versicolor
97,5.7,2.9,4.2,1.3,Iris-versicolor
98,6.2,2.9,4.3,1.3,Iris-versicolor
99,5.1,2.5,3.0,1.1,Iris-versicolor
100,5.7,2.8,4.1,1.3,Iris-versicolor
101,6.3,3.3,6.0,2.5,Iris-virginica
102,5.8,2.7,5.1,1.9,Iris-virginica
103,7.1,3.0,5.9,2.1,Iris-virginica
104,6.3,2.9,5.6,1.8,Iris-virginica
105,6.5,3.0,5.8,2.2,Iris-virginica
106,7.6,3.0,6.6,2.1,Iris-virginica
107,4.9,2.5,4.5,1.7,Iris-virginica
108,7.3,2.9,6.3,1.8,Iris-virginica
109,6.7,2.5,5.8,1.8,Iris-virginica
110,7.2,3.6,6.1,2.5,Iris-virginica
111,6.5,3.2,5.1,2.0,Iris-virginica
112,6.4,2.7,5.3,1.9,Iris-virginica
113,6.8,3.0,5.5,2.1,Iris-virginica
114,5.7,2.5,5.0,2.0,Iris-virginica
115,5.8,2.8,5.1,2.4,Iris-virginica
116,6.4,3.2,5.3,2.3,Iris-virginica
117,6.5,3.0,5.5,1.8,Iris-virginica
118,7.7,3.8,6.7,2.2,Iris-virginica
119,7.7,2.6,6.9,2.3,Iris-virginica
120,6.0,2.2,5.0,1.5,Iris-virginica
121,6.9,3.2,5.7,2.3,Iris-virginica
122,5.6,2.8,4.9,2.0,Iris-virginica
123,7.7,2.8,6.7,2.0,Iris-virginica
124,6.3,2.7,4.9,1.8,Iris-virginica
125,6.7,3.3,5.7,2.1,Iris-virginica
126,7.2,3.2,6.0,1.8,Iris-virginica
127,6.2,2.8,4.8,1.8,Iris-virginica
128,6.1,3.0,4.9,1.8,Iris-virginica
129,6.4,2.8,5.6,2.1,Iris-virginica
130,7.2,3.0,5.8,1.6,Iris-virginica
131,7.4,2.8,6.1,1.9,Iris-virginica
132,7.9,3.8,6.4,2.0,Iris-virginica
133,6.4,2.8,5.6,2.2,Iris-virginica
134,6.3,2.8,5.1,1.5,Iris-virginica
135,6.1,2.6,5.6,1.4,Iris-virginica
136,7.7,3.0,6.1,2.3,Iris-virginica
137,6.3,3.4,5.6,2.4,Iris-virginica
138,6.4,3.1,5.5,1.8,Iris-virginica
139,6.0,3.0,4.8,1.8,Iris-virginica
140,6.9,3.1,5.4,2.1,Iris-virginica
141,6.7,3.1,5.6,2.4,Iris-virginica
142,6.9,3.1,5.1,2.3,Iris-virginica
143,5.8,2.7,5.1,1.9,Iris-virginica
144,6.8,3.2,5.9,2.3,Iris-virginica
145,6.7,3.3,5.7,2.5,Iris-virginica
146,6.7,3.0,5.2,2.3,Iris-virginica
147,6.3,2.5,5.0,1.9,Iris-virginica
148,6.5,3.0,5.2,2.0,Iris-virginica
149,6.2,3.4,5.4,2.3,Iris-virginica
150,5.9,3.0,5.1,1.8,Iris-virginica
{
"cells": [
{
"cell_type": "markdown",
"id": "f90cdfdc-bc05-4cdc-b900-022517b33b41",
"metadata": {},
"source": [
"[Iris flower data set](https://en.wikipedia.org/wiki/Iris_flower_data_set) with [Polars](https://pola.rs/)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "fd4ec3fc-c2d8-4a50-bc9c-99cafefb9647",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting polars\n",
" Downloading polars-1.20.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)\n",
"Downloading polars-1.20.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32.9 MB)\n",
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m32.9/32.9 MB\u001b[0m \u001b[31m37.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:01\u001b[0m\n",
"\u001b[?25hInstalling collected packages: polars\n",
"Successfully installed polars-1.20.0\n"
]
}
],
"source": [
"!pip install polars"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "2f2387a7-a2aa-4c66-812d-aff6814fc4e9",
"metadata": {},
"outputs": [],
"source": [
"import polars as pl"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "dfdd0b3a-ae63-49cb-80a0-a1a4fab76366",
"metadata": {},
"outputs": [],
"source": [
"df = pl.scan_csv(\"iris.csv\").collect()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "070e59b6-6d06-43d9-b2b6-1edff05059d6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (150, 6)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>Id</th><th>SepalLengthCm</th><th>SepalWidthCm</th><th>PetalLengthCm</th><th>PetalWidthCm</th><th>Species</th></tr><tr><td>i64</td><td>f64</td><td>f64</td><td>f64</td><td>f64</td><td>str</td></tr></thead><tbody><tr><td>1</td><td>5.1</td><td>3.5</td><td>1.4</td><td>0.2</td><td>"Iris-setosa"</td></tr><tr><td>2</td><td>4.9</td><td>3.0</td><td>1.4</td><td>0.2</td><td>"Iris-setosa"</td></tr><tr><td>3</td><td>4.7</td><td>3.2</td><td>1.3</td><td>0.2</td><td>"Iris-setosa"</td></tr><tr><td>4</td><td>4.6</td><td>3.1</td><td>1.5</td><td>0.2</td><td>"Iris-setosa"</td></tr><tr><td>5</td><td>5.0</td><td>3.6</td><td>1.4</td><td>0.2</td><td>"Iris-setosa"</td></tr><tr><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td></tr><tr><td>146</td><td>6.7</td><td>3.0</td><td>5.2</td><td>2.3</td><td>"Iris-virginica"</td></tr><tr><td>147</td><td>6.3</td><td>2.5</td><td>5.0</td><td>1.9</td><td>"Iris-virginica"</td></tr><tr><td>148</td><td>6.5</td><td>3.0</td><td>5.2</td><td>2.0</td><td>"Iris-virginica"</td></tr><tr><td>149</td><td>6.2</td><td>3.4</td><td>5.4</td><td>2.3</td><td>"Iris-virginica"</td></tr><tr><td>150</td><td>5.9</td><td>3.0</td><td>5.1</td><td>1.8</td><td>"Iris-virginica"</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (150, 6)\n",
"┌─────┬───────────────┬──────────────┬───────────────┬──────────────┬────────────────┐\n",
"│ Id ┆ SepalLengthCm ┆ SepalWidthCm ┆ PetalLengthCm ┆ PetalWidthCm ┆ Species │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │\n",
"╞═════╪═══════════════╪══════════════╪═══════════════╪══════════════╪════════════════╡\n",
"│ 1 ┆ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ Iris-setosa │\n",
"│ 2 ┆ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 ┆ Iris-setosa │\n",
"│ 3 ┆ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 ┆ Iris-setosa │\n",
"│ 4 ┆ 4.6 ┆ 3.1 ┆ 1.5 ┆ 0.2 ┆ Iris-setosa │\n",
"│ 5 ┆ 5.0 ┆ 3.6 ┆ 1.4 ┆ 0.2 ┆ Iris-setosa │\n",
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
"│ 146 ┆ 6.7 ┆ 3.0 ┆ 5.2 ┆ 2.3 ┆ Iris-virginica │\n",
"│ 147 ┆ 6.3 ┆ 2.5 ┆ 5.0 ┆ 1.9 ┆ Iris-virginica │\n",
"│ 148 ┆ 6.5 ┆ 3.0 ┆ 5.2 ┆ 2.0 ┆ Iris-virginica │\n",
"│ 149 ┆ 6.2 ┆ 3.4 ┆ 5.4 ┆ 2.3 ┆ Iris-virginica │\n",
"│ 150 ┆ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ Iris-virginica │\n",
"└─────┴───────────────┴──────────────┴───────────────┴──────────────┴────────────────┘"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "d72c9b12-0a65-4aa3-9490-646018d49940",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"polars.dataframe.frame.DataFrame"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(df)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "13aad9e3-76fb-4b26-9c29-e30b0dbc6410",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['__add__',\n",
" '__annotations__',\n",
" '__array__',\n",
" '__arrow_c_stream__',\n",
" '__bool__',\n",
" '__class__',\n",
" '__contains__',\n",
" '__copy__',\n",
" '__dataframe__',\n",
" '__deepcopy__',\n",
" '__delattr__',\n",
" '__dict__',\n",
" '__dir__',\n",
" '__doc__',\n",
" '__eq__',\n",
" '__floordiv__',\n",
" '__format__',\n",
" '__ge__',\n",
" '__getattribute__',\n",
" '__getitem__',\n",
" '__getstate__',\n",
" '__gt__',\n",
" '__hash__',\n",
" '__init__',\n",
" '__init_subclass__',\n",
" '__iter__',\n",
" '__le__',\n",
" '__len__',\n",
" '__lt__',\n",
" '__mod__',\n",
" '__module__',\n",
" '__mul__',\n",
" '__ne__',\n",
" '__new__',\n",
" '__radd__',\n",
" '__reduce__',\n",
" '__reduce_ex__',\n",
" '__repr__',\n",
" '__reversed__',\n",
" '__rmul__',\n",
" '__setattr__',\n",
" '__setitem__',\n",
" '__setstate__',\n",
" '__sizeof__',\n",
" '__str__',\n",
" '__sub__',\n",
" '__subclasshook__',\n",
" '__truediv__',\n",
" '__weakref__',\n",
" '_accessors',\n",
" '_cast_all_from_to',\n",
" '_comp',\n",
" '_compare_to_non_df',\n",
" '_compare_to_other_df',\n",
" '_df',\n",
" '_div',\n",
" '_from_arrow',\n",
" '_from_pandas',\n",
" '_from_pydf',\n",
" '_ipython_key_completions_',\n",
" '_replace',\n",
" '_repr_html_',\n",
" '_row_encode',\n",
" '_to_metadata',\n",
" '_to_pandas_with_object_columns',\n",
" '_to_pandas_without_object_columns',\n",
" 'approx_n_unique',\n",
" 'bottom_k',\n",
" 'cast',\n",
" 'clear',\n",
" 'clone',\n",
" 'collect_schema',\n",
" 'columns',\n",
" 'corr',\n",
" 'count',\n",
" 'describe',\n",
" 'deserialize',\n",
" 'drop',\n",
" 'drop_in_place',\n",
" 'drop_nans',\n",
" 'drop_nulls',\n",
" 'dtypes',\n",
" 'equals',\n",
" 'estimated_size',\n",
" 'explode',\n",
" 'extend',\n",
" 'fill_nan',\n",
" 'fill_null',\n",
" 'filter',\n",
" 'flags',\n",
" 'fold',\n",
" 'gather_every',\n",
" 'get_column',\n",
" 'get_column_index',\n",
" 'get_columns',\n",
" 'glimpse',\n",
" 'group_by',\n",
" 'group_by_dynamic',\n",
" 'hash_rows',\n",
" 'head',\n",
" 'height',\n",
" 'hstack',\n",
" 'insert_column',\n",
" 'interpolate',\n",
" 'is_duplicated',\n",
" 'is_empty',\n",
" 'is_unique',\n",
" 'item',\n",
" 'iter_columns',\n",
" 'iter_rows',\n",
" 'iter_slices',\n",
" 'join',\n",
" 'join_asof',\n",
" 'join_where',\n",
" 'lazy',\n",
" 'limit',\n",
" 'map_rows',\n",
" 'max',\n",
" 'max_horizontal',\n",
" 'mean',\n",
" 'mean_horizontal',\n",
" 'median',\n",
" 'melt',\n",
" 'merge_sorted',\n",
" 'min',\n",
" 'min_horizontal',\n",
" 'n_chunks',\n",
" 'n_unique',\n",
" 'null_count',\n",
" 'partition_by',\n",
" 'pipe',\n",
" 'pivot',\n",
" 'plot',\n",
" 'product',\n",
" 'quantile',\n",
" 'rechunk',\n",
" 'rename',\n",
" 'replace_column',\n",
" 'reverse',\n",
" 'rolling',\n",
" 'row',\n",
" 'rows',\n",
" 'rows_by_key',\n",
" 'sample',\n",
" 'schema',\n",
" 'select',\n",
" 'select_seq',\n",
" 'serialize',\n",
" 'set_sorted',\n",
" 'shape',\n",
" 'shift',\n",
" 'shrink_to_fit',\n",
" 'slice',\n",
" 'sort',\n",
" 'sql',\n",
" 'std',\n",
" 'style',\n",
" 'sum',\n",
" 'sum_horizontal',\n",
" 'tail',\n",
" 'to_arrow',\n",
" 'to_dict',\n",
" 'to_dicts',\n",
" 'to_dummies',\n",
" 'to_init_repr',\n",
" 'to_jax',\n",
" 'to_numpy',\n",
" 'to_pandas',\n",
" 'to_series',\n",
" 'to_struct',\n",
" 'to_torch',\n",
" 'top_k',\n",
" 'transpose',\n",
" 'unique',\n",
" 'unnest',\n",
" 'unpivot',\n",
" 'unstack',\n",
" 'update',\n",
" 'upsample',\n",
" 'var',\n",
" 'vstack',\n",
" 'width',\n",
" 'with_columns',\n",
" 'with_columns_seq',\n",
" 'with_row_count',\n",
" 'with_row_index',\n",
" 'write_avro',\n",
" 'write_clipboard',\n",
" 'write_csv',\n",
" 'write_database',\n",
" 'write_delta',\n",
" 'write_excel',\n",
" 'write_ipc',\n",
" 'write_ipc_stream',\n",
" 'write_json',\n",
" 'write_ndjson',\n",
" 'write_parquet']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dir(df)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "8c126b37-4a9b-4582-b8e9-5c7fe45afedc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Id',\n",
" 'SepalLengthCm',\n",
" 'SepalWidthCm',\n",
" 'PetalLengthCm',\n",
" 'PetalWidthCm',\n",
" 'Species']"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "f0e0d26b-5b26-4127-809e-3d132a7c5cdd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (3, 6)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>Id</th><th>SepalLengthCm</th><th>SepalWidthCm</th><th>PetalLengthCm</th><th>PetalWidthCm</th><th>Species</th></tr><tr><td>i64</td><td>f64</td><td>f64</td><td>f64</td><td>f64</td><td>str</td></tr></thead><tbody><tr><td>1</td><td>5.1</td><td>3.5</td><td>1.4</td><td>0.2</td><td>"Iris-setosa"</td></tr><tr><td>2</td><td>4.9</td><td>3.0</td><td>1.4</td><td>0.2</td><td>"Iris-setosa"</td></tr><tr><td>3</td><td>4.7</td><td>3.2</td><td>1.3</td><td>0.2</td><td>"Iris-setosa"</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (3, 6)\n",
"┌─────┬───────────────┬──────────────┬───────────────┬──────────────┬─────────────┐\n",
"│ Id ┆ SepalLengthCm ┆ SepalWidthCm ┆ PetalLengthCm ┆ PetalWidthCm ┆ Species │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │\n",
"╞═════╪═══════════════╪══════════════╪═══════════════╪══════════════╪═════════════╡\n",
"│ 1 ┆ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ Iris-setosa │\n",
"│ 2 ┆ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 ┆ Iris-setosa │\n",
"│ 3 ┆ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 ┆ Iris-setosa │\n",
"└─────┴───────────────┴──────────────┴───────────────┴──────────────┴─────────────┘"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "2991a8d6-52cd-4ae0-b66c-6ae5f5f0476c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (2, 6)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>Id</th><th>SepalLengthCm</th><th>SepalWidthCm</th><th>PetalLengthCm</th><th>PetalWidthCm</th><th>Species</th></tr><tr><td>i64</td><td>f64</td><td>f64</td><td>f64</td><td>f64</td><td>str</td></tr></thead><tbody><tr><td>149</td><td>6.2</td><td>3.4</td><td>5.4</td><td>2.3</td><td>"Iris-virginica"</td></tr><tr><td>150</td><td>5.9</td><td>3.0</td><td>5.1</td><td>1.8</td><td>"Iris-virginica"</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (2, 6)\n",
"┌─────┬───────────────┬──────────────┬───────────────┬──────────────┬────────────────┐\n",
"│ Id ┆ SepalLengthCm ┆ SepalWidthCm ┆ PetalLengthCm ┆ PetalWidthCm ┆ Species │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │\n",
"╞═════╪═══════════════╪══════════════╪═══════════════╪══════════════╪════════════════╡\n",
"│ 149 ┆ 6.2 ┆ 3.4 ┆ 5.4 ┆ 2.3 ┆ Iris-virginica │\n",
"│ 150 ┆ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ Iris-virginica │\n",
"└─────┴───────────────┴──────────────┴───────────────┴──────────────┴────────────────┘"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.tail(2)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "91776e1e-1355-47de-b08a-fc76204ba086",
"metadata": {},
"outputs": [],
"source": [
"df = pl.scan_csv(\"iris.csv\").filter(pl.col(\"SepalLengthCm\") > 5).group_by(\"Species\").agg(pl.all().sum()).collect()\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "75a2a13e-91bc-4005-8347-4244a8669911",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (3, 6)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>Species</th><th>Id</th><th>SepalLengthCm</th><th>SepalWidthCm</th><th>PetalLengthCm</th><th>PetalWidthCm</th></tr><tr><td>str</td><td>i64</td><td>f64</td><td>f64</td><td>f64</td><td>f64</td></tr></thead><tbody><tr><td>"Iris-setosa"</td><td>564</td><td>116.9</td><td>81.7</td><td>33.2</td><td>6.1</td></tr><tr><td>"Iris-versicolor"</td><td>3562</td><td>281.9</td><td>131.8</td><td>202.9</td><td>63.3</td></tr><tr><td>"Iris-virginica"</td><td>6168</td><td>324.5</td><td>146.2</td><td>273.1</td><td>99.6</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (3, 6)\n",
"┌─────────────────┬──────┬───────────────┬──────────────┬───────────────┬──────────────┐\n",
"│ Species ┆ Id ┆ SepalLengthCm ┆ SepalWidthCm ┆ PetalLengthCm ┆ PetalWidthCm │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
"╞═════════════════╪══════╪═══════════════╪══════════════╪═══════════════╪══════════════╡\n",
"│ Iris-setosa ┆ 564 ┆ 116.9 ┆ 81.7 ┆ 33.2 ┆ 6.1 │\n",
"│ Iris-versicolor ┆ 3562 ┆ 281.9 ┆ 131.8 ┆ 202.9 ┆ 63.3 │\n",
"│ Iris-virginica ┆ 6168 ┆ 324.5 ┆ 146.2 ┆ 273.1 ┆ 99.6 │\n",
"└─────────────────┴──────┴───────────────┴──────────────┴───────────────┴──────────────┘"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}