¶ index_static.py

2010-05-31 11:10

# -*- coding: utf-8 -*-
"""
index_static.py
    - adjusted for pyblosxom-cmd staticrender, so that all exported links end in .html
base index.py from:
Ryan Barrett <[email protected]>
 
This plugin displays an alphabetical index of all entries. It uses these
optional config variables from config.py, shown here with their defaults:
 
py['index_trigger']            = '/site-index'
py['index_num_columns']        = 2
py['index_letters_first']      = True
py['index_title']              = 'index'
py['index_use_story_template'] = True
 
 
VERSION:
0.2
 
TODO:
- use a template instead of hard-coded HTML
 
Copyright 2006 Ryan Barrett
 
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
 
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
"""
import math
import os.path
import time
from Pyblosxom import tools
import Pyblosxom.entries
 
__author__ = 'Ryan Barrett'
__version__ = '0.2'
__description__ = 'Displays an alphabetical index of all entries.'
 
 
def verify_installation(request):
  return 1
 
def cb_filelist(args):
  request = args['request']
  http = request.getHttp()
  data = request.getData()
  config = request.getConfiguration()
 
  trigger = config.get('index_trigger', 'site-index')
  if http['PATH_INFO'] != trigger:
    return
 
  # get the entries
  datadir = config['datadir']
  files = tools.Walk(request, datadir)
  files.sort()
 
  # sort into sections, one for each letter. the dictionary is
  # letter => (entry name, path) where path is the relative to datadir.
  sections = {}
  entry_extensions = data['extensions'].keys()
 
  for file in files:
    assert file.startswith(datadir)
    path, ext = os.path.splitext(file[len(datadir):])
    if ext[1:] in entry_extensions:  # strip the leading period from ext
      entry_name = os.path.basename(path)
      sections.setdefault(entry_name[0].upper(), []).append((entry_name, path))
 
  # extract the first letters. sort as usual, except that numbers and other
  # non-letters go *after* letters.
  def letters_before_symbols(a, b):
    if a.isalpha() and not b.isalpha():
      return -1
    elif not a.isalpha() and b.isalpha():
      return 1
    else:
      return cmp(a, b)
 
  letters = sections.keys()
  if config.get('index_letters_first', 1):
    letters.sort(letters_before_symbols)
  else:
    letters.sort()
 
  # add the header with links to each section
  body = '<p class="index-header">'
  letter_links = ['<a href="#%s">%s</a>' % (l, l) for l in letters]
  body += ' |\n'.join(letter_links)
  body += '</p>\n<hr class="index">\n\n'
 
  # add the sections themselves, with one link per entry, in a table. the
  # number of columns is taken from the index_num_columns config variable.
  # entries are ordered down each column, in order.
  num_cols = config.get('index_num_columns', 2)
 
  for l in letters:
    body += '<h3 class="index">%s</h3> <a name="%s"></a>\n' % (l, l)
    body += '<table class="index"><tbody>\n'
 
    entries = sections[l]
    entries.sort()
    num_rows = int(math.ceil(float(len(entries)) / num_cols))
 
    for row in range(0, num_rows):
      # alternate the <tr> tags' class between index-row-stripe-0 and
      # index-row-stripe-1, so you can use CSS to alternate their color for
      # readability, if you want.
      body += '<tr class="index-row-stripe-%d">\n' % (row % 2)
      for col in range(0, num_cols):
        entry_index = col * num_rows + row
        if entry_index < len(entries):
          entry_name, path = entries[entry_index]
        else:
          entry_name = path = ''
        body += '<td><a href="%s.html">%s</a></td>\n' % (path[1:], entry_name)
      body += '</tr>\n'
 
    body += '</tbody></table>\n<hr class="index">\n\n'
 
  data = {'title': config.get('index_title', 'index')}
  # use the epoch for mtime. otherwise, pyblosxom uses the current time, which
  # makes other plugins (like weblogsping) think this is a new entry.
  epoch = time.localtime(0)
  fe = Pyblosxom.entries.base.generate_entry(request, data, body, epoch)
  return [fe]
 
def cb_story(args):
  request = args['request']
  http = request.getHttp()
  config = request.getConfiguration()
  trigger = config.get('index_trigger', 'site-index')
 
  if (http['PATH_INFO'] == trigger and
      not config.get('index_use_story_template', 1)):
    title = config.get('index_title', 'index')
    args['template'] = '<h1 class="index">%s</h1>\n<hr>\n$body' % title
 
  return args
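
For reference, a minimal config.py sketch for using this plugin together with pyblosxom-cmd staticrender. The index_* values mirror the defaults listed in the docstring above; the static_urls entry is an assumption about how the trigger page gets picked up by static rendering and should be checked against your PyBlosxom install:

# config.py (sketch, not taken from the plugin itself)
py['index_trigger']            = '/site-index'   # the PATH_INFO that produces the index page
py['index_num_columns']        = 2
py['index_letters_first']      = True
py['index_title']              = 'index'
py['index_use_story_template'] = True

# assumption: the trigger also has to be listed among the URLs that staticrender visits
py['static_urls'] = ['/site-index']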

§ Written: Mon, 31 May 2010 | permalink; source: rdf, rss, raw | category: /techic/PyBlosxom/plugins §


¶ hardcodedates.py

2010-05-31 11:10

# -*- coding: utf-8 -*-
from Pyblosxom import tools
import os, os.path, posix, re, stat, time
 
FILETIME = re.compile('^([0-9]{4})-([0-1][0-9])-([0-3][0-9])-([0-2][0-9])-([0-5][0-9]) +(.*)$')
 
all_timestamps = {}
extensions = []
timestamps_to_save = {}
 
 
#mode: python; indent-tabs-mode: t, tab-width: 4
"""
This allows the user to create a file "timestamps" in their datadir,
that will override the timestamp of any given blog entry. Each line
in this file should be of the form "YYYY-MM-DD-hh-mm file-name".
Then, for any entry that has such a line, the system will use that
timestamp instead of the file's actual modification time.
 
Note: the filename is relative to your data-dir.
Example of a line for the file /var/data-dir/school/abc.txt
   where the datadir is "/var/data-dir/" and the date is Aug 9, 2004.
 
2004-08-09-00-00 school/abc.txt
"""
__author__ = 'Nathan Kent Bullock, Ryan Barrett'
__email__ = 'nathan_kent_bullock -at- yahoo.ca, hardcodedates -at- ryanb.org'
__version__ = '1.4'
def init(request):
    if all_timestamps:
        return  # already initialized
 
    datadir = request.getConfiguration()['datadir']
    timestamp_file = os.path.join(datadir, 'timestamps')
 
    if os.path.isfile(timestamp_file):
        f = file(timestamp_file)
        for str in f.readlines():
            m = FILETIME.search(str.strip())
            if m:
                year = int(m.group(1))
                mo = int(m.group(2))
                day = int(m.group(3))
                hr = int(m.group(4))
                minute = int(m.group(5))
                mtime = time.mktime((year,mo,day,hr,minute,0,0,0,-1))
                filename = os.path.join(datadir, m.group(6))
                all_timestamps[filename] = mtime
 
        f.close()
 
    extensions.extend(request.getData()['extensions'].keys())
    extensions.append(request.getConfiguration().get('comment_ext', 'cmt'))
 
def cb_filestat(args):
    request = args['request']
    init(request)
 
    filename = args['filename']
    extension = os.path.splitext(filename)[1][1:]
    datadir = request.getConfiguration()['datadir']
 
    if all_timestamps.has_key(filename):
        # we know this file's timestamp
        mtime = args['mtime']
        assert isinstance(mtime, (tuple, posix.stat_result))
        args['mtime'] = (mtime[0:stat.ST_MTIME] + (all_timestamps[filename],) +
                         mtime[stat.ST_MTIME + 1:])
 
    elif extension in extensions and filename.startswith(datadir):
        # we don't know it, but we should. ask the os for it, and remember it.
        args['mtime'] = os.stat(filename)
        all_timestamps[filename] = args['mtime'][stat.ST_MTIME]
        timestamps_to_save[filename] = args['mtime'][stat.ST_MTIME]
 
    return args
 
def cb_end(args):
    if timestamps_to_save:
        datadir = args['request'].getConfiguration()['datadir']
        datadir = os.path.normpath(datadir)
        tsfile = file(os.path.join(datadir, 'timestamps'), 'a')
        for filename, mtime in timestamps_to_save.items():
            time_str = time.strftime('%Y-%m-%d-%H-%M', time.localtime(mtime))
            # strip the datadir prefix and directory separator slash
            filename = filename[len(datadir) + 1:]
            tsfile.write('%s %s\n' % (time_str, filename))
            tools.getLogger().info('Saved mtime %s for %s' % (time_str, filename))
        tsfile.close()
        timestamps_to_save.clear()

§ Written: Mon, 31 May 2010 | permalink; source: rdf, rss, raw | category: /techic/PyBlosxom/plugins §


¶ category_static.py

2010-05-31 11:10

# -*- coding: utf-8 -*-
"""
category_static.py
    - adjusted for pyblosxom-cmd staticrender, so that all exported links end in .html
base index.py from:
Ryan Barrett <[email protected]>
    - Zoom.Quiet 100401
 
This plugin displays a category index (as a tree) of all entries. It uses these
optional config variables from config.py, shown here with their defaults:
 
py['cindex_trigger']            = '/site-index'
py['cindex_num_columns']        = 2
py['cindex_letters_first']      = True
py['cindex_title']              = 'index'
py['cindex_use_story_template'] = True
 
 
VERSION:
0.2
 
TODO:
- use a template instead of hard-coded HTML
 
Copyright 2006 Ryan Barrett
 
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
 
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
"""
 
 
import math
import os.path
import time
from operator import itemgetter
from Pyblosxom import tools
import Pyblosxom.entries
#import re
 
__author__ = 'Zoom.Quiet <zoomquiet+pyb at gmail dot com>'
__version__ = '11.09.7'
__description__ = 'Displays a category index, as a tree of all entries, on one page.'
DEFAULT_ROOT = []
def verify_installation(request):
  return 1
 
def cb_filelist(args):
    request = args['request']
    http = request.getHttp()
    data = request.getData()
    config = request.getConfiguration()
    _baseurl = config.get("base_url", "")
 
    trigger = config.get('cindex_trigger', 'site-index')
    if http['PATH_INFO'] != trigger:
        return
 
    # get the entries
    datadir = config['datadir']
    files = tools.Walk(request, datadir)
    files.sort()
 
    body = '<div id="categoriselist">'
    #print files
    # sort into sections, one for each letter. the dictionary is
    # letter => (entry name, path) where path is the relative to datadir.
    #sections = {}
    #   the entrise dictionary is
    # path => (entry name, 0)
    entrise = []
 
    entry_extensions = data['extensions'].keys()
 
    for file in files:
        #objEntry = entries.fileentry.FileEntry(request, file,datadir)
        #print objEntry.keys() 
        assert file.startswith(datadir)
        path, ext = os.path.splitext(file[len(datadir):])
        if ext[1:] in entry_extensions:  # strip the leading period from ext
            entry_name = os.path.basename(path)
            #sections.setdefault(entry_name[0].upper(), []).append((entry_name, path))
            entrise.append((entry_name,path,file))
    #print entrise
    #sortPaths = sorted(entrise.iteritems(), key=itemgetter(1), reverse=True)
    #print sortPaths
    #print _baseurl
    etree = {}
    """{
    "pathID":[(path.split()),"title",...]
    ,
    }
    """
    for entry in entrise:
        e = Pyblosxom.entries.fileentry.FileEntry(request, entry[2], entry[1])
        deeps = entry[1].split("/")[:-1]
        pathID = "".join(deeps)
        if pathID in etree:
            etree[pathID].append((e['title'],entry[1]))
        else:
            etree[pathID]= [tuple(deeps),(e['title'],entry[1])]
    #print etree.keys()
    root_path_list = config.get("category_root_list", DEFAULT_ROOT)
    root_entry_list = []
    for opath in root_path_list:
        #print opath
        crtRoot = []
        for k in etree.keys():
            if opath in k:
                crtRoot.append(k)
        crtRoot.sort()
        root_entry_list.append((opath,crtRoot))
 
    '''root_entry_list as::
    [('Zen', ['ZenChinese', 'ZenGoogle', 'Zenpythonic']), ('oss', ['oss', 'ossFreeBSD', 'ossMozillaFireFox', 'ossUbuntu']), ('opening', []), ('mind', ['mind']), ('Quiet', ['Quietliving', 'Quietnomeans']), ('utility', ['utilitySubversion', 'utilitySubversionhooks', 'utilitypy4strStructuredText', 'utilitypy4webDjango', 'utilitypy4webMoinMoin', 'utilitypy4webQuixote', 'utilitypy4zh', 'utilityzqlib']), ('internet', ['internet', 'internetFolksonomy']), ('easy', ['easymovie', 'easymusic']), ('techic', ['techic', 'techicEmacs', 'techicPyBlosxom', 'techicPyBlosxomblosxom', 'techicPyBlosxomplugins'])]
    '''
    body += '<h3>/</h3>'
    for e in etree[''][1:]:
        #print "etree[''] include::",e
        body += '<span class="indents">%s</span><a href="%s%s.html">%s</a><br>\n'%(
            "...."
            ,_baseurl
            ,e[1]
            ,e[0]
            )
    #print root_entry_list
 
    for k in root_entry_list:
        #['techic', 'techicEmacs', 'techicPyBlosxom', 'techicPyBlosxomblosxom', 'techicPyBlosxomplugins']
        body += '<h4>%s/</h4>'%k[0]
        cpath = ""
        for p in k[1]:
            #print etree[p]
            #[('', 'Zen', 'Chinese'), '9.18', 'CC Salon BJ', '\xe2\x80\x9c\xe5\x9b\xbd\xe9\x99\x85\xe8\x87\xaa\xe7\x94\xb1\xe8\xbd\xaf\xe4\xbb\xb6\xe6\x97\xa5\xe2\x80\x9d\xe4\xb9\x8b\xe5\xa4\xb4\xe8\x84\x91\xe9\xa3\x8e\xe6\x9a\xb4', '\xe8\xa1\xa8\xe5\xbd\xa2\xe7\xa0\x81\xe7\x9a\x84\xe6\xb6\x88\xe4\xba\xa1\xe8\x83\x8c\xe6\x99\xaf\xef\xbc\x81']
            epath = "/".join(etree[p][0][2:])
            if k[0] != "".join(etree[p][0]):
                if cpath != epath:
                    cpath = epath
                    ldeep = len(etree[p][0][1:])
                    if 3 > ldeep:
                        body += '<h5>%s/</h5>'%"/".join(etree[p][0][2:])
                    else:
                        body += '<h6>%s/</h6>'%"/".join(etree[p][0][3:])
            for e in etree[p][1:]:
                body += '<span id="%s" class="indents">%s</span><a href="%s%s.html">%s</a><br>\n'%(
                        "/".join(etree[p][0])
                        ,"..."*len(etree[p][0])
                        ,_baseurl
                        ,e[1]
                        ,e[0]
                        )
 
    '''
    [('', 'easy', 'movie'), '\xe4\xb8\x96\xe9\x97\xb4\xe5\xae\x89\xe5\xbe\x97\xe5\x8f\x8c\xe5\x85\xa8\xe6\xb3\x95,\xe4\xb8\x8d\xe8\xb4\x9f\xe5\xa6\x82\xe6\x9d\xa5\xe4\xb8\x8d\xe8\xb4\x9f\xe5\x8d\xbf!']
    [('', 'easy', 'music'), 'ZARD\xe6\xb6\x88\xe9\x80\x9d\xe4\xba\x86']
 
    for entry in entrise:
        e = Pyblosxom.entries.fileentry.FileEntry(request, entry[2], entry[1])
        #print e['title']
        #print entry[1].split("/")[:-1]
        body += '<span class="indents">%s</span><a href="%s%s.html">%s</a>%s<br>\n'%(
                "...."*len(entry[1].split("/"))
                ,_baseurl
                ,entry[1]
                ,e['title'] #entry[0]
                ,entry[1]
                )
    '''
    #print body
    body +="</div>"
    data = {'title': config.get('cindex_title', 'index')}
    # use the epoch for mtime. otherwise, pyblosxom uses the current time, which
    # makes other plugins (like weblogsping) think this is a new entry.
    epoch = time.localtime(0)
    fe = Pyblosxom.entries.base.generate_entry(request, data, body, epoch)
    return [fe]
 
def cb_story(args):
  request = args['request']
  http = request.getHttp()
  config = request.getConfiguration()
  trigger = config.get('cindex_trigger', 'site-index')
 
  if (http['PATH_INFO'] == trigger and
      not config.get('cindex_use_story_template', 1)):
    title = config.get('cindex_title', 'index')
    args['template'] = '<h1 class="index">%s</h1>\n<hr>\n$body' % title
 
  return args
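
For reference, a minimal config.py sketch for this plugin. The cindex_* values mirror the defaults in the docstring above; category_root_list and base_url are read in cb_filelist, and the concrete values shown here are only illustrative:

# config.py (sketch; values are examples, not requirements)
py['cindex_trigger']            = '/site-index'
py['cindex_num_columns']        = 2
py['cindex_letters_first']      = True
py['cindex_title']              = 'index'
py['cindex_use_story_template'] = True
py['category_root_list']        = ['Zen', 'oss', 'techic']      # top-level category prefixes to group under
py['base_url']                  = 'http://blog.example.org'     # hypothetical; prefixes the generated .html links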

§ Written: Mon, 31 May 2010 | permalink; source: rdf, rss, raw | category: /techic/PyBlosxom/plugins §


¶ IOP: Practice, Part 1

2010-05-13 19:19

1. Background

What was the task?

  • In a high-load service environment, the system's responses needed to be sped up
  • For each data query, the existing system tries three database sources in turn:
    1. memcache
    2. redis
    3. MySQL
  • Expected speed-up for queries: at least 300%
Runtime environment
Client machine:
    - my laptop, an HP 520
    - dual-core CPU, 2 GB RAM
    - Ubuntu 9.10
DB host:
    - CentOS 5.0
    - single-core CPU, 4 GB RAM

1.1. Plan

At first thought this seemed simple: just put it all in memory.

  • A count showed that fewer than 20 million records are needed for the queries
  • Written out as a file, that is no more than about 1 GB
  • Loaded into memory as a Python dict it should at most double, which is entirely acceptable (a quick way to sanity-check this is sketched below)
  • And the speed?!
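
A quick way to sanity-check that estimate, sketched here with a made-up record shape and count (ru_maxrss is reported in KB on Linux):

# -*- coding: utf-8 -*-
# rough check of "loading the data into a Python dict at most doubles its size"
import resource, struct

N = 1000000                       # illustrative; scale toward the real record count
before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
d = {}
for i in xrange(N):
    # 4-byte packed key and a small list of tuples, roughly the shape used later in r2d.py
    d[struct.pack('I', i)] = [(i, '1.2.3.4', i % 7)]
after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print "peak RSS grew by ~%d MB for %d records" % ((after - before) / 1024, N)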

2. IOP speed-up techniques

Unexpectedly, most of the speed-up came from nothing more than writing the code shorter and shorter!

Don't know what IOP is?

  • PyIOP
  • Heh, the programming attitude summed up by 沈游侠 >...

2.1. 100k pairs: 170 + 4 seconds

The most direct implementation:
  • read from redis
  • build a dict object
  • dump it to a serialized file with pickle
  • load that back into a dict object with pickle

2.1.1. code

Code:

#!/usr/bin/python
# -*- coding: utf-8 -*-
import struct,sys,time
import cPickle as pickle
import redis
REVERSION = "r2d.py v10.5.7"
def _push2dict(dictall,key,smembers):
    # use the arguments that were passed in (the original re-read rb.smembers(k) via globals here)
    dictall[struct.pack('I',int(key[1:]))] = [s.split("|") for s in smembers]
    return dictall
if __name__ == '__main__':      # this way the module can be
    if 3 != len(sys.argv):
        print """ %s usage::
$ python r2d.py redistIP limitnumber [like 10000]
        """ % REVERSION
    else:
        hostIP = sys.argv[1]
        limitn = sys.argv[2]
        rb = redis.Redis(host=hostIP, port=6379, db=9)
        rbkeys = rb.keys().split()
        loop = int(limitn)
        s4dict={}
        for k in rbkeys:
            if 0 == loop:
                break
            else:
                loop -=1
                _push2dict(s4dict,k,rb.smembers(k))
        pickle.dump(s4dict, open('r4d.dump', 'wb'))

2.1.2. speed

  • The code is about as simple as it gets: a single function, ~20 lines
  • Speed tests::
    • 100k pairs: export >170 s, import >4 s;
    • 1M pairs: export >500 s, import >25 s;
    • 10M pairs: export failed! Memory ran out first!

2.1.3. improve

That is completely unacceptable...

Speed-up attempts::
  1. Following IOP, avoid functions: drop that one-line helper and fold its code back into the loop ~ an immediate gain of a few seconds (see the sketch after the next code block)
  2. We were already on cPickle, so nothing more to squeeze out of the module
  3. Hmm, do we even need cPickle?
    • Just write out a plain .py file, right?!
# use str() to write the dict object into a .py file as plain text
vdf = open("r2d.define.py","w")
vdf.write("s4dict=")
vdf.write(str(s4dict))
vdf.close()
# at load time, just import it
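
For step 1 above (dropping the one-line helper), a minimal sketch of the inlined loop, reusing the names from r2d.py v10.5.7:

# _push2dict folded back into the main loop: same behaviour, one function call less per key
for k in rbkeys:
    if 0 == loop:
        break
    loop -= 1
    s4dict[struct.pack('I', int(k[1:]))] = [s.split("|") for s in rb.smembers(k)]
pickle.dump(s4dict, open('r4d.dump', 'wb'))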

  • Tests after the change::
    • 100k pairs: export >160 s, import >3 s;
    • 1M pairs: export >400 s, import >19 s;
    • 10M pairs: export is still a tragedy
  • The gain is marginal:
    • export time barely changed
    • load time did improve
  • A further observation: memory grows wildly during export:
    • at the 1M level it eats about 1.6 GB
    • redis itself is memory-hungry too; at the 10M level it also takes over a GB (depending, of course, on the number and content of the entries)
    • for a server this approach is unacceptable

2.2. 100k pairs: 150 + 10 seconds

Improve the intermediate data format:
  • read from redis
  • write an intermediate log file
  • import it into a dict object

2.2.1. thinking

Because these are the facts:

  1. To write the dict object out, whether as pickle or as .py, the whole object first has to be built in memory
  2. As the dict grows, that build step inevitably takes longer and longer

A reminder from 沈游侠: the dict object can actually be written out linearly:

  • Say the dict is structured like:
    {key:[(v1,v2,v3),..]
    ,...
    }

    • a two-level K:[list] structure
  • Then, via intermediate data text such as:
    ('key', [(v1, 'v2', v3)])
    ...

  • it can be loaded linearly, e.g.:
    s4d = {}
    for l in open("r2d.define.py.log","r").readlines():
        dd = eval(l)              # each line is ('key', [(v1, v2, v3), ...]), the mid-data format above
        if dd[0] in s4d:
            s4d[dd[0]].extend(dd[1])
        else:
            s4d[dd[0]] = dd[1]

2.2.2. code

Quickly revised working code:

#!/usr/bin/python
# -*- coding: utf-8 -*-
import struct,sys,time
import redis
REVERSION = "r2d.py v10.5.8"
if __name__ == '__main__':      # this way the module can be
    if 3 != len(sys.argv):
        print """ %s usage::
$ python r2d.py redistIP limitnumber [like 10000] > mid-data.log
        """ % REVERSION
    else:
        hostIP = sys.argv[1]
        limitn = sys.argv[2]
        rb = redis.Redis(host=hostIP, port=6379, db=9)
        rbkeys = rb.keys().split()
        loop = int(limitn)
        d2f = sys.stdout    # mid-data lines go to stdout; redirect to mid-data.log as in the usage above
        for k in rbkeys:
            if 0 == loop:
                break
            else:
                loop -=1
                dictkey = struct.pack('I',int(k[1:]))
                sli = []
                for s in rb.smembers(k):
                    rli = s.split("|")[:3]
                    if rli:
                        rli[0] = int(rli[0])
                        rli[1] = struct.pack('I',int(rli[1]))
                        rli[2] = int(rli[2])
                        sli.append(tuple(rli))
                print >> d2f,`dictkey,sli`
                # note: `obj` is the same as repr(obj)

2.2.3. improve

The code is still simple: no functions, ~30 lines

  • Speed tests::
    • 100k pairs: export >150 s, import >10 s;
    • 1M pairs: export >500 s, import >20 s;
    • 10M pairs: export >1300 s, import >90 s;
  • The raw speed shows no obvious improvement
  • But!
    1. The 10-million-pair data set can now run to completion on a low-spec machine!
    2. Memory use stays steady at about 90 MB and no longer grows with the dict!

OK, that counts as usable...

2.3. 100k pairs: 100 + 200 seconds?!

Keep improving:
  • drop redis and read directly from MySQL
  • write the intermediate log
  • import it into a dict

2.3.1. thinking

redis may be billed as the fastest K/V database product, but it was clearly what dragged down the response time of the whole service..

Why?
  • A server program and a local program face different conditions
  • A high-load server program and a low-load one differ again
  • Roughly speaking:
    1. small service ~= <C60 per second
    2. medium service ~= <C600 per second
    3. large service ~= >C1000 per second
  • And the dominant constraint is completely different:
    1. small service -> language execution speed
    2. medium service -> framework efficiency
    3. large service -> I/O response time
  • So for a web-facing query service, Redis and MySQL are both, from the business system's point of view, inter-process communication!
  • And every cross-process call means at least four I/O operations!
  • So the current Redis->log->in-memory-dict pipeline in fact still carries the MySQL->Redis inter-process step inside it
  • Also:
    • Combing through the Redis docs turns up nothing like iterkeys()!
    • However many Redis pairs you want to convert, you first have to pull all the keys with keys() and then match them one by one
    • No wonder the script runs more than 5x slower, and uses several times more memory, against the full Redis store (tens of millions of pairs) than against the small test store (200k pairs)!

So: read directly from the relevant MySQL tables! (the sketch below contrasts the two I/O patterns)
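
A small sketch of the I/O pattern being described, reusing rb from r2d.py above; the point is the shape of the traffic, not the exact calls:

import sys

# redis route: every run first drags the whole key space across the socket,
# because there is no iterkeys()-style incremental access, then pays one more
# round trip per key:
#   rbkeys = rb.keys()            # one huge reply, even for a small test run
#   for k in rbkeys:
#       rb.smembers(k)            # another inter-process hop per key
#
# mysql-over-a-pipe route: rows arrive on stdin one line at a time, so the
# script touches each row once and never holds more than the current line:
for line in sys.stdin:
    cols = line.split()           # v1 v2 v3 v4 id, as selected by the template
    # ... pack the columns and print the mid-data line ...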

2.3.2. code

Together with a SQL template:

-- _tpl/all_black.tpl
SELECT v1,v2,v3,v4,id FROM t_black LIMIT %(limitMAX)s;

Core code:

#!/usr/bin/python
# -*- coding: utf-8 -*-
REVERSION = "m2d.py v10.5.9"
import struct,sys,time
 
if __name__ == '__main__':      # this way the module can be
    """usage:
$ python m2d.py limit [such as 100] |\
  mysql -h xx.xx.xx.xx -u User -D --password=***  |\
  python m2d.py > m2d.log
    """
    if sys.stdin:
        if 1 < len(sys.argv):
            limit = sys.argv[1]
            limitMAX = int(limit)
            print >> sys.stderr, REVERSION    # debug info goes to stderr so stdout stays a clean SQL stream
            print open("_tpl/all_black.tpl","r").read()%locals()
        else:
            s4dict={}
            virusname={}
            totalN = 0
            d2f = sys.stdout    # mid-data lines go to stdout (redirected to m2d.log in the usage above)
            for l in sys.stdin.readlines():
                if "id" in l:
                    pass
                else:
                    totalN += 1
                    lkv=l.split()
                    dictkey = struct.pack('I',int(lkv[0]))
                    lkv[1] = int(lkv[1])
                    lkv[2] = struct.pack('I',int(lkv[2]))
                    lkv[3] = int(lkv[3])
                    print >> d2f,`dictkey,tuple(lkv[1:])`
  • Whether an extra argument is present decides between generating the SQL and processing MySQL's output data
  • To keep standard output clean, debug messages go to standard error
  • Debugging should likewise follow the levels of the pipeline, trying it one stage at a time
  • The driving shell script:
    #!/bin/sh
    #   m2d.sh v10.5.9
    python m2d.py go |\
        mysql -h xx.xx.xx.xx -u User --password=***|\
        python mysql4dict.py $1 |\
        mysql -h xx.xx.xx.xx -u User --password=***|\
        python m2d.py > m2d.log

    The loading code barely changed

2.3.3. speed

  • Speed tests::
    • 1k pairs: export <5 s, import <1 s;
    • 100k pairs: export >300 s, import >10 s;
    • 1M pairs: export is a tragedy
    • 10M pairs: export is a tragedy
  • Why does the speed depend on how many rows are queried? And why does MySQL refuse to serve as soon as the query gets big?

2.3.4. improve

Hmm, FT! Of course: MySQL caps how long a request/response may take, so pushing a huge data transfer through one query is bound to fail. Use LIMIT to slice it!

  • Rework the SQL-generating part of the m2d.py v10.5.9 above:
    step = 5000.0
    limitMAX = int(limit)
    sqltpl = open("_tpl/all_black.tpl","r").read()
    if 1 >= limitMAX/step:
        print sqltpl%locals()
    else:
        for i in range(int(limitMAX/step)):
            limitMAX = "%d,%d"%(step*i,int(step))
            print sqltpl%locals()
  • With 5000 as the slice size, this generates constraints like LIMIT 15000,5000
  • Speed tests::
    • 1M pairs: export >290 s, import >190 s;
    • 10M pairs: export is a tragedy

FT! What is going on? Unexpectedly slow!

2.4. 10M pairs: 1100 s + 480 s

Improvements after calming down:
  • read from MySQL, slicing with constraints on id
  • write the intermediate log, but redesign its structure
  • import it into a dict

2.4.1. thinking

Why is MySQL this slow?! Most likely it is the LIMIT offset itself: for something like LIMIT 150000,5000 the server still has to walk through and discard the first 150000 rows before returning anything, so each later slice gets slower, while a WHERE range on the indexed id goes straight to the rows.

2.4.2. code

The SQL template, improved to match:

-- _tpl/all_black.tpl
SELECT v1,v2,v3,v4,id FROM t_black WHERE id>%(LIMbwID)s AND id<=%(MAXbwID)s;

The SQL-generation code:

step = 3000.0
offset = int(step)
# < <gen_sql_with_max> > in Leo, child nodes can turn piles of code into semantic markers like this
for l in sys.stdin.readlines():
    if "max(id)" not in l:
        amount = l.split()
        MAXbwID = int(amount[0])
        MINbwID = int(amount[1])
        MAXbwA = MAXbwID - MINbwID
MAXbMAX = MAXbwID
lastID = 0
sqltpl = open("_tpl/all_black.tpl","r").read()
for i in range(MAXbwA/offset):
    MAXbwID,LIMbwID = (MAXbMAX-offset*i,MAXbMAX-offset*(i+1))
    print sqltpl%locals()
    lastID = LIMbwID
MAXbwID,LIMbwID = lastID,MINbwID   # also query the remaining tail of IDs below the last full slice
print sqltpl%locals()
  • Heh, of course, to slice precisely by ID you first have to know the largest and smallest IDs
  • Use the SQL: SELECT max(id),min(id) FROM t_black ;
  • and, naturally, chain the whole thing into one process with OS pipes:
    #!/bin/sh
    #   m2d.sh v10.5.9
    DATE=`date "+%y%m%d-%H%M%S"`
    mysql -h xx.xx.xx -u User --password=*** < _tpl/total_black.sql |\
        python m2d.py go |\
        mysql -h xx.xx.xx -u User --password=*** |\
        python m2d.py > m2d_$DATE.log

2.4.3. improve

  • Speed tests::
    • 1M pairs: export >280 s, import >120 s;
      • m2d.log > 150 MB
    • 10M pairs: export >1300 s, import >780 s;
      • m2d.log > 570 MB

Going further, it turned out the business logic could optimize the two-level, two-step lookup into a single lookup!

  • Simply change the exported dict's data structure to:
    {key:v,...
    }

  • Combining the original two values with the original key into a new key makes it globally unique (a sketch follows below)
  • so each dict entry corresponds one-to-one with a MySQL row
  • Re-run the speed tests::
    • 1M pairs: export >250 s, import >100 s;
      • m2d.log > 130 MB
    • 10M pairs: export >1100 s, import >480 s;
      • m2d.log > 490 MB
  • Wow! And once loaded, the dict object's memory footprint also shrank by about 30%!
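
A minimal sketch of that flattening; which two fields get folded into the key is business-specific, so the names and values below are purely illustrative:

import struct

# one illustrative mid-data row: (dictkey, (v1, v2, v3, id))
dictkey = struct.pack('I', 12345)
v1, v2, v3 = 7, struct.pack('I', 1678), 2

s4d = {}
# before: s4d[dictkey] -> [(v1, v2, v3), ...], which still has to be scanned at query time
# after:  fold v1 and v2 into the key, so a single lookup returns the remaining value
compound_key = dictkey + struct.pack('I', v1) + v2      # unique per MySQL row
s4d[compound_key] = v3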

2.5. Notes

To sum up:
  • The main IOP moves applied here:
    1. avoid functions
    2. avoid modules
    3. use the OS's standard I/O to chain the steps together wherever possible
  • DOP ~ Data Oriented Programming
    • programming oriented around the data
    • in the end, every program solves its problem by manipulating data
    • on the server side, under heavy concurrency, the most effective approach is whatever saves I/O
    • code that compresses the volume of data going in and out is naturally the best code!

3. A 100x speed-up

Hmm?! Reading to the end, nothing here seems to have gotten 100 times faster, does it?

  • True, true; what is shared above is the speed-up story of the scripts that turn the data into an in-memory dict
  • But the thing that ultimately had to be fast is the whole query service, right?
  • Since that touches the company's core service, its code cannot be shown
  • Still, the reality is:
    1. the basic IOP techniques mentioned above were applied
    2. the processing time per query went from the original 0.* seconds to 0.00* seconds, at least 100x
    3. and the corresponding code went from a few thousand lines down to a few dozen

So, roughly, you can read it this way:

  • every time you halve the size of the code
  • you can hope for a 10x gain in running speed

    Don't believe it? Give it a try...

  • t2t rendered:: 2010-10-09 02:21:36
  • powered by:: txt2tags

§ Written: Thu, 13 May 2010 | permalink; source: rdf, rss, raw | category: /Zen/IOP §
