summaryrefslogtreecommitdiff
path: root/tools/apache_count_dist.py
blob: 40b51a9176a2809b9a5e1f618c309c482394bfd0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/python
try:
    import psycopg
except ImportError:
    import psycopg2 as psycopg
    psycopg.TimestampFromMx = lambda x: x 

import sys, os, ConfigParser, urlparse
import datetime
import urlparse
from mx.DateTime import DateTime
from mx.DateTime.Timezone import utc_offset
from itertools import chain

from apache_reader import ApacheLogReader
from apache_stats import ApacheDistantLocalStats
from apache_stats import LocalStats
from apache_stats import ApacheLocalStats

def get_cursor(config):
    """Open the PostgreSQL connection described by the [database]
    section of *config* and return a (connection, cursor) pair."""
    connection = psycopg.connect(
        database=config.get('database', 'name'),
        user=config.get('database', 'user'),
        password=config.get('database', 'password'))
    return connection, connection.cursor()

def _log(msg):
    """Write *msg* plus a newline to stdout and flush immediately,
    so progress is visible even when output is piped."""
    sys.stdout.write('%s\n' % msg)
    sys.stdout.flush()

def _dotlog(msg):
    sys.stdout.write(msg)
    sys.stdout.flush()

def main(config_file, logfile):
    """Populate per-file download counts from the local apache log and
    the mirrors' published daily stats, then rebuild yesterday's local
    and global stats files.

    config_file -- path to the ini-style configuration file
    logfile -- path to the local apache access log to process
    """
    # Read config file
    p = ConfigParser.ConfigParser()
    p.read(config_file)

    # Folder under which each mirror's stats are cached locally.
    # (kept under a distinct name: the old code reused `mirrors` for
    # both this string and, later, the list of mirror stat iterators,
    # which the closure below silently depended on)
    mirrors_folder = p.get('mirrors', 'folder')

    # Server-relative URI prefix: path component of webui/files_url
    files_url = urlparse.urlsplit(p.get('webui', 'files_url'))[2]
    # Setup database connection
    dbconn, cursor = get_cursor(p)

    # build an iterator over the local log plus every mirror's stats
    cursor.execute("select * from mirrors")

    def read_distant_stats(mirror, filename):
        """Fetch one mirror's daily stats file, caching it under
        mirrors_folder/<mirror domain>."""
        mirror_domain = urlparse.urlparse(mirror[0])[1]
        cache_folder = os.path.join(mirrors_folder, mirror_domain)
        distant_reader = ApacheDistantLocalStats(cache_folder)
        # mirror[0] looks like the mirror root URL and mirror[3] its
        # stats folder -- TODO confirm against the mirrors table schema
        stat_file_url = '%s/%s/%s' % (mirror[0], mirror[3], filename)
        return distant_reader.read_stats(stat_file_url)

    # it supposes it runs the program at day + 1
    yesterday = datetime.datetime.now() - datetime.timedelta(1)
    filename = yesterday.strftime('%Y-%m-%d.bz2')
    mirror_stats = [read_distant_stats(mirror, filename)
                    for mirror in cursor.fetchall()]

    logs = chain(*[ApacheLogReader(logfile, files_url)] + mirror_stats)
    _log('Working with local stats and %d mirror(s)' % len(mirror_stats))

    # get last http access, so already-counted entries are skipped
    cursor.execute("select value from timestamps where name='http'")
    last_http = cursor.fetchone()[0]
    _log('Last time stamp was : %s' % last_http)

    downloads = {}

    # let's read the logs in the apache file
    for line in logs:
        day = int(line.get('day', yesterday.day))
        # month may be a non-numeric token from the log reader; it is
        # passed to DateTime unconverted -- presumably DateTime accepts
        # it; verify against apache_reader's output
        month = line.get('month', yesterday.month)
        year = int(line.get('year', yesterday.year))
        hour = int(line.get('hour', 0))
        minute = int(line.get('min', 0))
        sec = int(line.get('sec', 0))
        date = DateTime(year, month, day, hour, minute, sec)
        # shift by the log entry's timezone before comparing
        zone = utc_offset(line.get('zone', 0))
        date = date - zone
        count = int(line.get('count', 1))
        if date < last_http:
            continue

        # NOTE: use a loop-local name -- the old code reassigned
        # `filename` here, so the stats files built below were named
        # after the last log entry instead of yesterday's date
        fname = line['filename']

        _dotlog('.')
        # see if we have already read the old download count
        if fname not in downloads:
            cursor.execute("select downloads from release_files "
                           "where filename=%s", (fname,))
            record = cursor.fetchone()
            if not record:
                # No file entry. Could be a .sig file
                continue
            # make sure we're working with a number
            downloads[fname] = record[0] or 0
        # add a download
        downloads[fname] += count

    # only touch the database when something was counted (the old test
    # compared the dict against [], which is always true)
    if downloads:

        for filename_, count in downloads.items():
            # Update the download counts in the DB
            _log('Updating download count for %s: %s' % (filename_, count))
            cursor.execute("update release_files set downloads=%s "
                           "where filename=%s", (count, filename_))

        # Update the download timestamp
        date = psycopg.TimestampFromMx(datetime.datetime.now())
        cursor.execute("update timestamps set value=%s "
                       "where name='http'", (date,))

        dbconn.commit()

    # now creating the local stats file
    _log('Building local stats file')
    stats = ApacheLocalStats()
    stats_dir = p.get('mirrors', 'local-stats')
    if not os.path.exists(stats_dir):
        raise ValueError('"%s" folder not found (local-stats in config.ini)'
                         % stats_dir)
    stats_file = os.path.join(stats_dir, filename)
    stats.build_daily_stats(yesterday.year, yesterday.month, yesterday.day,
                            logfile, stats_file, files_url, 'bz2')

    # now creating the global stats file
    # which is built with the latest database counts
    _log('Building global stats file')
    globalstats_dir = p.get('mirrors', 'global-stats')
    if not os.path.exists(globalstats_dir):
        raise ValueError('"%s" folder not found (global-stats in config.ini)'
                         % globalstats_dir)
    cursor.execute("select name, filename, downloads from release_files")

    def get_line(files_url):
        """Yield pseudo log entries built from the release_files rows,
        all dated yesterday, for the global stats builder."""
        for row in cursor:
            data = {}
            data['day'] = yesterday.day
            data['month'] = yesterday.month
            data['year'] = yesterday.year
            data['filename'] = row[1]
            data['useragent'] = 'Unkown'  # not stored yet (typo kept: it
                                          # is written into the stats file)
            data['packagename'] = row[0]
            data['count'] = row[2]
            yield data

    gstats = LocalStats()
    stats_file = os.path.join(globalstats_dir, filename)
    gstats.build_daily_stats(yesterday.year, yesterday.month, yesterday.day,
                             get_line, stats_file, files_url, 'bz2')


if __name__=='__main__':
    # Expects exactly two arguments: the config file and the apache log.
    if len(sys.argv) != 3:
        # Fixed the usage text: this script is apache_count_dist.py,
        # not apache_count.py. The parenthesized print works under both
        # Python 2 (parenthesized expression) and Python 3.
        print("Usage: apache_count_dist.py configfile logfile")
        raise SystemExit
    main(*sys.argv[1:])