#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
class URL:
    """Split a URL string into protocol, host, port, path and query arguments.

    The pattern is applied with ``re.search``, so the URL may be preceded by
    other text; the first URL-shaped substring is the one that is parsed.
    Path and query are both optional. User-info (``user:pass@``) and
    bracketed IPv6 hosts are not supported.

    Attributes:
        url: The input string with surrounding whitespace stripped.
        protocol: Scheme in front of ``://`` (e.g. ``'https'``).
        host: Host name or IPv4 address; may be empty (``file:///...``).
        port: Port as an ``int``, or ``None`` when the URL has no port.
        path: Path without its leading ``/``; ``''`` when absent.
        args: Dict of query parameters. Names and values are kept exactly as
            written (no percent-decoding). A name without ``=`` maps to
            ``''``; a repeated name keeps its last value.

    Raises:
        ValueError: With message ``'not find url'`` when no URL is found.
            ``ValueError`` subclasses ``Exception``, so existing
            ``except Exception`` handlers keep working.
    """

    # Host accepts digits 0-9, letters, '.', '-' and '_'. Underscore is listed
    # explicitly because the previous 'A-z' range happened to accept it.
    # Path stops at '?', '#', whitespace or '"'; the query stops at '#',
    # whitespace or '"', so a fragment never leaks into path or args.
    _PATTERN = re.compile(
        r'(?P<protocol>[A-Za-z][A-Za-z0-9+.\-]*)://'
        r'(?P<host>[0-9A-Za-z._\-]*)'
        r'(?::(?P<port>\d{1,5}))?'
        r'(?:/(?P<path>[^?#\s"]*))?'
        r'(?:\?(?P<args>[^#\s"]*))?'
    )

    def __init__(self, url):
        """Parse *url* and populate the attributes described on the class."""
        self.url = url.strip()
        # Match against the stripped text so stray whitespace never ends up
        # inside a parsed component.
        match = self._PATTERN.search(self.url)
        if not match:
            raise ValueError('not find url')

        self.protocol = match.group('protocol')
        self.host = match.group('host')
        port = match.group('port')
        self.port = int(port) if port is not None else None
        # Optional groups are None when missing; normalise to empty values.
        self.path = match.group('path') or ''

        self.args = {}
        query = match.group('args') or ''
        for pair in query.split('&'):
            if not pair:
                # Produced by an empty query, a trailing '&' or '&&'.
                continue
            # Split on the first '=' only so values may contain '='.
            name, _, value = pair.partition('=')
            self.args[name] = value
if __name__ == '__main__':
    # Demo: parse a sample search URL and print every parsed component as a
    # "name: value" line, in the order url, protocol, host, path, args.
    demo_url = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=python%20re&oq=python%2520re&rsv_pq=81e580d10006294c&rsv_t=aca82%2FkFyqJi4GCe8V9LnetUu1zQF2xkVlA3JNhdyri%2BBVm9VZPW19tSMto&rqlang=cn&rsv_enter=0'
    parsed = URL(demo_url)
    for label in ('url', 'protocol', 'host', 'path', 'args'):
        print('%s: %s' % (label, getattr(parsed, label)))
# Output:
# url: https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=python%20re&oq=python%2520re&rsv_pq=81e580d10006294c&rsv_t=aca82%2FkFyqJi4GCe8V9LnetUu1zQF2xkVlA3JNhdyri%2BBVm9VZPW19tSMto&rqlang=cn&rsv_enter=0
# protocol: https
# host: www.baidu.com
# path: s
# args: {'wd': 'python%20re', 'f': '8', 'rsv_enter': '0', 'rsv_bp': '1', 'rsv_t': 'aca82%2FkFyqJi4GCe8V9LnetUu1zQF2xkVlA3JNhdyri%2BBVm9VZPW19tSMto', 'rsv_idx': '1', 'tn': 'baidu', 'rqlang': 'cn', 'rsv_pq': '81e580d10006294c', 'ie': 'utf-8', 'oq': 'python%2520re'}
# Comments