#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Split a URL into protocol, host, port, path and query arguments with ``re``."""
import re


class URL:
    """A URL broken into its components by a regular expression.

    Attributes:
        url: The input string with surrounding whitespace removed.
        protocol: Text before the first ``://`` (for example ``'https'``).
        host: Host name or dotted IPv4 address; may be empty.
        port: Port number as an ``int``, or ``None`` when not given.
        path: Path without its leading ``/``; ``''`` when absent.
        args: Query parameters as a ``dict`` of raw (still percent-encoded)
            strings. When a key repeats, the last value wins.
        fragment: Text after ``#``, or ``None`` when absent.

    Raises:
        ValueError: If the string does not look like ``protocol://host...``.
    """

    # Compiled once for all instances. The pattern is anchored at the end
    # with ``$`` so that trailing text which fits no component makes the
    # whole URL invalid instead of being silently dropped.
    _PATTERN = re.compile(
        r'(?P<protocol>.*?)://'
        # Digits include 0, and '-' is legal in host names. '_' is kept
        # because the previous (accidental) ``A-z`` range accepted it.
        r'(?P<host>[0-9a-zA-Z._-]*)'
        r'(?::(?P<port>\d{1,5}))?'
        r'(?:/(?P<path>[^?#]*))?'
        r'(?:\?(?P<args>[^#]*))?'
        r'(?:#(?P<fragment>.*))?$'
    )

    def __init__(self, url):
        """Parse *url* and populate the component attributes.

        Args:
            url: The URL string; leading/trailing whitespace is ignored.

        Raises:
            ValueError: If *url* cannot be parsed.
        """
        self.url = url.strip()
        # Match the stripped text so stray whitespace never leaks into the
        # protocol or the last query value.
        match = self._PATTERN.search(self.url)
        if not match:
            raise ValueError('not find url')
        self.protocol = match.group('protocol')
        self.host = match.group('host')
        port = match.group('port')
        self.port = int(port) if port is not None else None
        # Optional groups yield None when missing; keep ``path`` a string.
        self.path = match.group('path') or ''
        self.fragment = match.group('fragment')
        self.args = self._parse_query(match.group('args'))

    @staticmethod
    def _parse_query(query):
        """Turn ``'a=1&b=2'`` into ``{'a': '1', 'b': '2'}`` without decoding."""
        args = {}
        if not query:
            return args
        for pair in query.split('&'):
            if not pair:
                # Produced by a trailing '&' or by '&&'; nothing to record.
                continue
            # partition() tolerates a key without '=' (value becomes '')
            # and keeps any further '=' characters inside the value.
            key, _, value = pair.partition('=')
            args[key] = value
        return args


if __name__ == '__main__':
    url = (
        'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu'
        '&wd=python%20re&oq=python%2520re&rsv_pq=81e580d10006294c'
        '&rsv_t=aca82%2FkFyqJi4GCe8V9LnetUu1zQF2xkVlA3JNhdyri%2BBVm9VZPW19tSMto'
        '&rqlang=cn&rsv_enter=0'
    )
    u = URL(url)
    print('url: %s' % u.url)
    print('protocol: %s' % u.protocol)
    print('host: %s' % u.host)
    print('path: %s' % u.path)
    print('args: %s' % u.args)
输出:
url: https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=python%20re&oq=python%2520re&rsv_pq=81e580d10006294c&rsv_t=aca82%2FkFyqJi4GCe8V9LnetUu1zQF2xkVlA3JNhdyri%2BBVm9VZPW19tSMto&rqlang=cn&rsv_enter=0
protocol: https
host: www.baidu.com
path: s
args: {'wd': 'python%20re', 'f': '8', 'rsv_enter': '0', 'rsv_bp': '1', 'rsv_t': 'aca82%2FkFyqJi4GCe8V9LnetUu1zQF2xkVlA3JNhdyri%2BBVm9VZPW19tSMto', 'rsv_idx': '1', 'tn': 'baidu', 'rqlang': 'cn', 'rsv_pq': '81e580d10006294c', 'ie': 'utf-8', 'oq': 'python%2520re'}
评论