import json
import os, re, sys, subprocess, platform
import tarfile
from distutils import log
from contextlib import closing, contextmanager
from ftplib import FTP
try:
from urllib.parse import urljoin, unquote, urlparse
from urllib.request import urlretrieve, urlopen, urlcleanup, Request
except ImportError: # Py2
from urlparse import urljoin, unquote, urlparse
from urllib import urlretrieve, urlcleanup
from urllib2 import urlopen, Request
# Default parallelism flags passed to "make": one job more than the number
# of CPUs, with the CPU count capped at 5 (i.e. at most "-j6").  The list
# stays empty on single-CPU machines or when the count cannot be determined.
multi_make_options = []
try:
    import multiprocessing
    cpus = multiprocessing.cpu_count()
    if cpus > 1:
        if cpus > 5:
            cpus = 5
        multi_make_options = ['-j%d' % (cpus+1)]
except Exception:
    # Best effort only (e.g. ImportError or NotImplementedError): fall back
    # to a serial build.  Unlike a bare "except:", this does not swallow
    # KeyboardInterrupt or SystemExit.
    pass

# overridable to control script usage
sys_platform = sys.platform
# use pre-built libraries on Windows
def download_and_extract_windows_binaries(destdir):
    """Fetch and unpack the newest pre-built Windows libraries.

    Reads the GitHub release list of lxml/libxml2-win-binaries, picks the
    release with the greatest tag name, and downloads the libxml2, libxslt,
    zlib and iconv archives for the target architecture into *destdir*.
    A download is skipped when a "<archive>.keep" marker file exists.
    Every archive is then unpacked into *destdir*.

    Returns a dict mapping each library name to its unpacked directory.
    """
    releases_url = "https://api.github.com/repos/lxml/libxml2-win-binaries/releases"
    releases, _ = read_url(releases_url, accept="application/vnd.github+json", as_json=True)

    # The newest release is the one whose tag compares greatest as a string.
    newest = {'tag_name': ''}
    for candidate in releases:
        if candidate.get('tag_name', '') > newest['tag_name']:
            newest = candidate

    download_url = "https://github.com/lxml/libxml2-win-binaries/releases/download/%s/" % newest['tag_name']
    asset_names = [asset['name'] for asset in newest.get('assets', ())]

    # Check for native ARM64 build or the environment variable that is set by
    # Visual Studio for cross-compilation (same variable as setuptools uses)
    targets_arm64 = platform.machine() == 'ARM64' or os.getenv('VSCMD_ARG_TGT_ARCH') == 'arm64'
    if targets_arm64:
        arch = "win-arm64"
    else:
        arch = "win64" if sys.maxsize > 2**32 else "win32"
    if sys.version_info < (3, 5):
        arch = 'vs2008.' + arch

    # Archive file name per library, using the newest version on offer.
    archives = {}
    for libname in ['libxml2', 'libxslt', 'zlib', 'iconv']:
        newest_version = find_max_version(libname, asset_names)
        archives[libname] = "%s-%s.%s.zip" % (libname, newest_version, arch)

    if not os.path.exists(destdir):
        os.makedirs(destdir)

    unpacked = {}
    for libname, archive_name in archives.items():
        srcfile = urljoin(download_url, archive_name)
        destfile = os.path.join(destdir, archive_name)
        if not os.path.exists(destfile + ".keep"):
            print('Retrieving "%s" to "%s"' % (srcfile, destfile))
            urlcleanup()  # work around FTP bug 27973 in Py2.7.12+
            urlretrieve(srcfile, destfile)
        else:
            print('Using local copy of "{}"'.format(srcfile))
        unpacked[libname] = unpack_zipfile(destfile, destdir)

    return unpacked
def find_top_dir_of_zipfile(zipfile):
    """Return the name of the single top-level directory of a zip archive.

    *zipfile* is an open ``zipfile.ZipFile``; only its ``filelist`` and
    ``filename`` attributes are used.  Every entry must live below one
    common first path component.  Archives that carry no explicit directory
    entries are accepted as long as all files share that component.

    Raises AssertionError if there is no such single directory.
    """
    names = [info.filename for info in zipfile.filelist]
    topdir = None
    if names:
        first, sep, _ = names[0].partition('/')
        # A bare top-level file has no "/" and hence no enclosing directory.
        if first and sep:
            prefix = first + '/'
            if all(name.startswith(prefix) for name in names):
                topdir = first
    if not topdir:
        # Raised explicitly (not via "assert") so the check survives "python -O".
        raise AssertionError(
            "cannot determine single top-level directory in zip file %s" %
            zipfile.filename)
    return topdir
def unpack_zipfile(zipfn, destdir):
    """Extract the zip archive *zipfn* into *destdir*.

    Returns the path of the archive's single top-level directory below
    *destdir* and asserts that it exists after extraction.
    """
    assert zipfn.endswith('.zip')
    import zipfile
    print('Unpacking %s into %s' % (os.path.basename(zipfn), destdir))
    # closing() guarantees the archive is closed even if extraction fails.
    with closing(zipfile.ZipFile(zipfn)) as archive:
        top_dir = find_top_dir_of_zipfile(archive)
        extracted_dir = os.path.join(destdir, top_dir)
        archive.extractall(path=destdir)
    assert os.path.exists(extracted_dir), 'missing: %s' % extracted_dir
    return extracted_dir
def get_prebuilt_libxml2xslt(download_dir, static_include_dirs, static_library_dirs):
    """Download the pre-built Windows libraries and register their paths.

    For every unpacked library, its "include" and "lib" sub-directories
    must exist; they are appended to *static_include_dirs* and
    *static_library_dirs* respectively (both lists are modified in place).
    """
    assert sys_platform.startswith('win')
    libs = download_and_extract_windows_binaries(download_dir)
    for lib_root in libs.values():
        include_dir = os.path.join(lib_root, 'include')
        library_dir = os.path.join(lib_root, 'lib')
        for required in (include_dir, library_dir):
            assert os.path.exists(required), 'does not exist: %s' % required
        static_include_dirs.append(include_dir)
        static_library_dirs.append(library_dir)
## Routines to download and build libxml2/xslt from sources:
# Base URLs that list the source release archives of each library.
_GNOME_SOURCES = 'https://download.gnome.org/sources/'
LIBXML2_LOCATION = _GNOME_SOURCES + 'libxml2/'
LIBXSLT_LOCATION = _GNOME_SOURCES + 'libxslt/'
LIBICONV_LOCATION = 'https://ftp.gnu.org/pub/gnu/libiconv/'
ZLIB_LOCATION = 'https://zlib.net/'

# Matches "<name>-<version>.<rest>" file names; group 1 is the version part
# (digits, dots and dashes).
match_libfile_version = re.compile(r'^[^-]*-([.0-9-]+)[.].*').match
def _find_content_encoding(response, default='iso8859-1'):
from email.message import Message
content_type = response.headers.get('Content-Type')
if content_type:
msg = Message()
msg.add_header('Content-Type', content_type)
charset = msg.get_content_charset(default)
else:
charset = default
return charset
def remote_listdir(url):
    """List the file names found at *url*.

    urllib is tried first.  If that fails with an IOError and *url* is an
    FTP URL, the listing is retried with ftplib.  For any other scheme there
    is no fallback and the original IOError is re-raised.
    """
    try:
        return _list_dir_urllib(url)
    except IOError:
        if not url.lower().startswith('ftp://'):
            # Keep the real network error instead of hiding it behind an
            # AssertionError (or, under "python -O", trying FTP on a
            # non-FTP URL).
            raise
        print("Requesting with urllib failed. Falling back to ftplib. "
              "Proxy argument will be ignored for %s" % url)
        return _list_dir_ftplib(url)
def _list_dir_ftplib(url):
    """List the files of the FTP directory at *url* using ftplib.

    Logs in without credentials, changes to the URL's path and parses the
    raw directory listing with parse_text_ftplist().
    """
    location = urlparse(url)
    connection = FTP(location.netloc)
    try:
        connection.login()
        connection.cwd(location.path)
        listing = []
        # FTP.dir() hands each listing line to the callback.
        connection.dir(listing.append)
    finally:
        connection.quit()
    return parse_text_ftplist("\n".join(listing))
def read_url(url, decode=True, accept=None, as_json=False):
    """Download *url* and return ``(data, content_type)``.

    *accept* is sent as the "Accept" request header when given.  With
    *decode*, the body is decoded using the charset of the response (see
    _find_content_encoding).  With *as_json*, the body is additionally
    parsed as JSON.  *content_type* is the raw Content-Type response header.
    """
    request_headers = {'User-Agent': 'https://github.com/lxml/lxml'}
    if accept:
        request_headers['Accept'] = accept
    request = Request(url, headers=request_headers)

    with closing(urlopen(request)) as response:
        charset = _find_content_encoding(response)
        content_type = response.headers.get('Content-Type')
        payload = response.read()

    if decode:
        payload = payload.decode(charset)
    if as_json:
        payload = json.loads(payload)
    return payload, content_type
def _list_dir_urllib(url):
    """Download the directory listing at *url* and parse out its file names.

    Responses with a "text/html" content type are parsed as an HTML index,
    everything else as a plain-text FTP-style listing.
    """
    data, content_type = read_url(url)
    is_html = content_type and content_type.startswith('text/html')
    parse = parse_html_filelist if is_html else parse_text_ftplist
    return parse(data)
def http_find_latest_version_directory(url, version=None):
    """Return the URL of the "<major>.<minor>/" sub-directory to download from.

    The index page at *url* is scanned for links to "<major>.<minor>"
    directories.  If *version* (e.g. "2.12.3" or "2.12") names a directory
    that is listed, that directory is used; otherwise the highest listed one.
    If the page lists no such directories, *url* itself is returned.
    """
    data, _ = read_url(url)
    # e.g. <a href="2.12/">
    directories = [
        (int(major), int(minor))
        for major, minor in re.findall(r' href=["\']([0-9]+)\.([0-9]+)/?["\']', data)
    ]
    if not directories:
        return url
    best_version = max(directories)
    if version:
        # Only "major.minor" is needed, so accept two-part versions as well
        # instead of failing to unpack them into three names.
        parts = version.split(".")
        if len(parts) >= 2:
            requested = (int(parts[0]), int(parts[1]))
            if requested in directories:
                best_version = requested
    latest_dir = "%s.%s" % best_version
    return urljoin(url, latest_dir) + "/"
def http_listfiles(url, re_pattern):
    """Return all matches of *re_pattern* in the text downloaded from *url*."""
    page, _ = read_url(url)
    return re.findall(re_pattern, page)
def parse_text_ftplist(s):
    """Yield the file names from an "ls -l" style (FTP) directory listing.

    Lines starting with "d" (directories) are skipped, and so are blank
    lines, which would otherwise fail with an IndexError.
    """
    for line in s.splitlines():
        if not line.strip() or line.startswith('d'):
            continue
        # -rw-r--r--   1 ftp      ftp           476 Sep  1  2011 md5sum.txt
        # Last (9th) element is 'md5sum.txt' in the above example, but there
        # may be variations, so we discard only the first 8 entries.
        yield line.split(None, 8)[-1]
def parse_html_filelist(s):
    """Yield the URL-unquoted file links found in an HTML directory index.

    Only href targets of <a> tags are considered; href attributes of other
    tags (e.g. <link>) are ignored.  Targets ending in "/" (directories)
    are skipped.  Each distinct link is yielded once, in sorted order, so
    the result is deterministic.
    """
    re_href = re.compile(
        r'''<\s*a\b[^>]*\shref=["']([^;?"']+?)[;?"']''',
        re.I|re.M)
    for link in sorted(set(re_href.findall(s))):
        if not link.endswith('/'):
            yield unquote(link)
def tryint(s):
    """Return ``int(s)`` if *s* parses as an integer, otherwise *s* unchanged."""
    try:
        value = int(s)
    except ValueError:
        value = s
    return value
@contextmanager
def py2_tarxz(filename):
    """Context manager yielding a TarFile for the .tar.xz file *filename*.

    The archive is decompressed by the external "xz" program into a
    temporary file, which is then opened with tarfile.  Both are closed on
    exit.
    """
    import tempfile
    with tempfile.TemporaryFile() as unpacked:
        # "xz -dc" writes the decompressed tar stream to the temp file.
        subprocess.check_call(["xz", "-dc", filename], stdout=unpacked.fileno())
        unpacked.seek(0)
        archive = tarfile.TarFile(fileobj=unpacked)
        with closing(archive) as tf:
            yield tf
def download_libxml2(dest_dir, version=None):
    """Downloads libxml2, returning the filename where the library was downloaded"""
    if version != "2.9.12":
        from_location = http_find_latest_version_directory(LIBXML2_LOCATION, version=version)
    else:
        # Temporarily using the latest master (2.9.12+) until there is a release that supports lxml again.
        from_location = "https://gitlab.gnome.org/GNOME/libxml2/-/archive/dea91c97debeac7c1aaf9c19f79029809e23a353/"
        version = "dea91c97debeac7c1aaf9c19f79029809e23a353"

    return download_library(
        dest_dir, from_location, 'libxml2',
        re.compile(r'libxml2-([0-9.]+[0-9]).tar.xz'),
        'libxml2-%s.tar.xz',
        version=version)
def download_libxslt(dest_dir, version=None):
    """Downloads libxslt, returning the filename where the library was downloaded"""
    from_location = http_find_latest_version_directory(LIBXSLT_LOCATION, version=version)
    return download_library(
        dest_dir, from_location, 'libxslt',
        re.compile(r'libxslt-([0-9.]+[0-9]).tar.xz'),
        'libxslt-%s.tar.xz',
        version=version)
def download_libiconv(dest_dir, version=None):
    """Downloads libiconv, returning the filename where the library was downloaded"""
    return download_library(
        dest_dir, LIBICONV_LOCATION, 'libiconv',
        re.compile(r'libiconv-([0-9.]+[0-9]).tar.gz'),
        'libiconv-%s.tar.gz',
        version=version)
def download_zlib(dest_dir, version=None):
    """Downloads zlib, returning the filename where the library was downloaded

    *version* defaults to None like in the other download functions; in
    that case download_library() looks up the newest available release.
    """
    version_re = re.compile(r'zlib-([0-9.]+[0-9]).tar.gz')
    filename = 'zlib-%s.tar.gz'
    return download_library(dest_dir, ZLIB_LOCATION, 'zlib',
                            version_re, filename, version=version)
def _version_sort_key(version_string):
    """Build a sort key that orders dotted version strings numerically.

    Each dot-separated component becomes ``(number, suffix)``, so that
    "39-2" sorts after "39" but before "40".  Components without a leading
    number sort first.  All components are the same type, so comparing
    keys never raises TypeError.
    """
    key = []
    for part in version_string.split('.'):
        digits = re.match(r'[0-9]*', part).group(0)
        key.append((int(digits) if digits else -1, part[len(digits):]))
    return tuple(key)


def find_max_version(libname, filenames, version_re=None):
    """Return the highest version string found in *filenames*.

    *version_re* must capture the version in group 1; by default it matches
    "<libname>-<version>" with an optional "-<suffix>" part.  The chosen
    version is also printed.

    Raises Exception if no file name matches.
    """
    if version_re is None:
        version_re = re.compile(r'%s-([0-9.]+[0-9](?:-[abrc0-9]+)?)' % libname)
    found = []
    for fn in filenames:
        match = version_re.search(fn)
        if match:
            found.append(match.group(1))
    if not found:
        raise Exception(
            "Could not find the most current version of %s from the files: %s" % (
                libname, filenames))
    # Ties on the numeric key are broken by the plain string, as before.
    version_string = max(found, key=lambda v: (_version_sort_key(v), v))
    print('Latest version of %s is %s' % (libname, version_string))
    return version_string
def download_library(dest_dir, location, name, version_re, filename, version=None):
    """Download a library source archive into *dest_dir* unless already there.

    *filename* is a "%s" template for the archive name.  Without *version*,
    the newest version is looked up in the listing at *location*.  If that
    lookup fails with an IOError, the newest matching "<name>-..." file
    already present in *dest_dir* is used instead; the error is re-raised
    if there is none.

    Returns the path of the (existing or freshly downloaded) archive.
    """
    if version is None:
        try:
            if not location.startswith('ftp://'):
                print(location)
                listing_pattern = '(%s)' % filename.replace('%s', '(?:[0-9.]+[0-9])')
                fns = http_listfiles(location, listing_pattern)
            else:
                fns = remote_listdir(location)
            version = find_max_version(name, fns, version_re)
        except IOError:
            # network failure - maybe we have the files already?
            latest = (0,0,0)
            for fn in os.listdir(dest_dir):
                if not fn.startswith(name+'-'):
                    continue
                match = match_libfile_version(fn)
                if not match:
                    continue
                version_tuple = tuple(map(tryint, match.group(1).split('.')))
                if version_tuple > latest:
                    latest = version_tuple
                    # Use the local file name as is; no version to fill in.
                    filename = fn
                    version = None
            if latest == (0,0,0):
                raise

    if version:
        filename = filename % version
    full_url = urljoin(location, filename)
    dest_filename = os.path.join(dest_dir, filename)

    if not os.path.exists(dest_filename):
        print('Downloading %s into %s from %s' % (name, dest_filename, full_url))
        urlcleanup()  # work around FTP bug 27973 in Py2.7.12
        urlretrieve(full_url, dest_filename)
    else:
        print(('Using existing %s downloaded into %s '
               '(delete this file if you want to re-download the package)') % (
            name, dest_filename))
    return dest_filename
def unpack_tarball(tar_filename, dest):
    """Extract the tar archive *tar_filename* into the directory *dest*.

    Returns *dest* joined with the top-level directory name of the first
    archive member.  Members below a different top-level name are reported
    on stdout but still extracted.
    """
    print('Unpacking %s into %s' % (os.path.basename(tar_filename), dest))
    # Py 2.7 lacks lzma support
    needs_external_xz = sys.version_info[0] < 3 and tar_filename.endswith('.xz')
    if needs_external_xz:
        tar_cm = py2_tarxz(tar_filename)
    else:
        tar_cm = closing(tarfile.open(tar_filename))

    base_dir = None
    with tar_cm as tar:
        for member in tar:
            top_level = member.name.split('/')[0]
            if base_dir is None:
                base_dir = top_level
            elif top_level != base_dir:
                print('Unexpected path in %s: %s' % (tar_filename, top_level))
        tar.extractall(dest)
    return os.path.join(dest, base_dir)
def call_subprocess(cmd, **kw):
    """Run the command *cmd* (a sequence of strings) and wait for it.

    Keyword arguments are passed on to subprocess.call().  The command is
    logged first.  Raises Exception if the exit code is non-zero.
    """
    import subprocess
    cmd_desc = ' '.join(cmd)
    log.info('Running "%s" in %s' % (cmd_desc, kw.get('cwd', '.')))
    returncode = subprocess.call(cmd, **kw)
    if not returncode:
        return
    raise Exception('Command "%s" returned code %s' % (cmd_desc, returncode))
def safe_mkdir(dir):
    """Create the directory *dir* (with parents) unless the path already exists."""
    if os.path.exists(dir):
        return
    os.makedirs(dir)
def cmmi(configure_cmd, build_dir, multicore=None, **call_setup):
    """Run "configure", "make" and "make install" in *build_dir*.

    *configure_cmd* is the configure command line as a list.  *multicore*
    selects the number of parallel make jobs: a value above 1 gives
    "-j<multicore>", any other true value a serial build, and a false value
    the module default (multi_make_options).  Extra keyword arguments are
    passed on to every call_subprocess() call.
    """
    print('Starting build in %s' % build_dir)
    call_subprocess(configure_cmd, cwd=build_dir, **call_setup)

    if multicore and int(multicore) > 1:
        make_jobs = ['-j%s' % multicore]
    elif multicore:
        make_jobs = []
    else:
        make_jobs = multi_make_options

    # First the plain build, then the install target.
    for make_target in ([], ['install']):
        call_subprocess(
            ['make'] + make_jobs + make_target,
            cwd=build_dir, **call_setup)
def configure_darwin_env(env_setup):
    """Store a build environment for macOS under ``env_setup['env']``.

    For macOS major versions above 7, default CFLAGS, LDFLAGS and
    MACOSX_DEPLOYMENT_TARGET values are chosen for arm64 or x86_64,
    depending on the machine reported by platform.mac_ver().  Variables
    already set in os.environ take precedence over these defaults.
    Otherwise *env_setup* is left untouched.
    """
    import platform
    # configure target architectures on MacOS-X (x86_64 only, by default)
    release, _, machine = platform.mac_ver()
    major_version, _minor_version = [int(part) for part in release.split('.')[:2]]
    if major_version <= 7:
        return

    if machine == "arm64":
        cflags, ldflags = "-arch arm64 -O2", "-arch arm64"
    else:
        cflags, ldflags = "-arch x86_64 -O2", "-arch x86_64"
    env_default = {
        'CFLAGS': cflags,
        'LDFLAGS': ldflags,
        'MACOSX_DEPLOYMENT_TARGET': "10.6",
    }
    env_default.update(os.environ)
    env_setup['env'] = env_default
def build_libxml2xslt(download_dir, build_dir,
                      static_include_dirs, static_library_dirs,
                      static_cflags, static_binaries,
                      libxml2_version=None,
                      libxslt_version=None,
                      libiconv_version=None,
                      zlib_version=None,
                      multicore=None):
    """Download and statically build zlib, libiconv, libxml2 and libxslt.

    The source archives are downloaded into *download_dir*, unpacked into
    *build_dir* and installed into the prefix "<build_dir>/libxml2".
    A library is only rebuilt if no ".a" file newer than its source
    directory exists in the prefix, or if a library built before it in
    this run had to be rebuilt.

    The include directories are appended to *static_include_dirs*, the
    library directory to *static_library_dirs* and the paths of the built
    ".a" files to *static_binaries* (all modified in place).
    *static_cflags* is not used by this function.  *multicore* is passed on
    to cmmi().

    Returns the tuple ``(xml2_config, xslt_config)`` with the paths of the
    "xml2-config" and "xslt-config" scripts below the prefix.
    """
    safe_mkdir(download_dir)
    safe_mkdir(build_dir)
    zlib_dir = unpack_tarball(download_zlib(download_dir, zlib_version), build_dir)
    libiconv_dir = unpack_tarball(download_libiconv(download_dir, libiconv_version), build_dir)
    libxml2_dir = unpack_tarball(download_libxml2(download_dir, libxml2_version), build_dir)
    libxslt_dir = unpack_tarball(download_libxslt(download_dir, libxslt_version), build_dir)
    # Everything gets installed into this one prefix directory.
    prefix = os.path.join(os.path.abspath(build_dir), 'libxml2')
    lib_dir = os.path.join(prefix, 'lib')
    safe_mkdir(prefix)

    lib_names = ['libxml2', 'libexslt', 'libxslt', 'iconv', 'libz']
    # Static libraries left over from a previous build, keyed by library name.
    existing_libs = {
        lib: os.path.join(lib_dir, filename)
        for lib in lib_names
        for filename in os.listdir(lib_dir)
        if lib in filename and filename.endswith('.a')
    } if os.path.isdir(lib_dir) else {}

    # The mutable default "_build_all_following" is deliberate: it keeps
    # state between the calls of this helper during one build run.
    def has_current_lib(name, build_dir, _build_all_following=[False]):
        if _build_all_following[0]:
            return False  # a dependency was rebuilt => rebuild this lib as well
        lib_file = existing_libs.get(name)
        # Up to date only if the built library is newer than its source directory.
        found = lib_file and os.path.getmtime(lib_file) > os.path.getmtime(build_dir)
        if found:
            print("Found pre-built '%s'" % name)
        else:
            # also rebuild all following libs (which may depend on this one)
            _build_all_following[0] = True
        return found

    call_setup = {}
    if sys_platform == 'darwin':
        configure_darwin_env(call_setup)

    # Common configure options for libiconv, libxml2 and libxslt.
    configure_cmd = ['./configure',
                     '--disable-dependency-tracking',
                     '--disable-shared',
                     '--prefix=%s' % prefix,
                     ]

    # build zlib
    zlib_configure_cmd = [
        './configure',
        '--prefix=%s' % prefix,
    ]
    if not has_current_lib("libz", zlib_dir):
        cmmi(zlib_configure_cmd, zlib_dir, multicore, **call_setup)

    # build libiconv
    if not has_current_lib("iconv", libiconv_dir):
        cmmi(configure_cmd, libiconv_dir, multicore, **call_setup)

    # build libxml2
    libxml2_configure_cmd = configure_cmd + [
        '--without-python',
        '--with-iconv=%s' % prefix,
        '--with-zlib=%s' % prefix,
    ]

    if not libxml2_version:
        # Derive the version from the unpacked directory name ("<name>-<version>").
        libxml2_version = os.path.basename(libxml2_dir).split('-', 1)[-1]

    if tuple(map(tryint, libxml2_version.split('-', 1)[0].split('.'))) >= (2, 9, 5):
        libxml2_configure_cmd.append('--without-lzma')  # can't currently build that

    try:
        if tuple(map(tryint, libxml2_version.split('-', 1)[0].split('.'))) >= (2, 7, 3):
            libxml2_configure_cmd.append('--enable-rebuild-docs=no')
    except Exception:
        pass # this isn't required, so ignore any errors

    if not has_current_lib("libxml2", libxml2_dir):
        if not os.path.exists(os.path.join(libxml2_dir, "configure")):
            # Allow building from git sources by running autoconf etc.
            libxml2_configure_cmd[0] = "./autogen.sh"
        cmmi(libxml2_configure_cmd, libxml2_dir, multicore, **call_setup)

    # Fix up libxslt configure script (needed up to and including 1.1.34)
    # https://gitlab.gnome.org/GNOME/libxslt/-/commit/90c34c8bb90e095a8a8fe8b2ce368bd9ff1837cc
    with open(os.path.join(libxslt_dir, "configure"), 'rb') as f:
        config_script = f.read()
    if b' --libs print ' in config_script:
        config_script = config_script.replace(b' --libs print ', b' --libs ')
        with open(os.path.join(libxslt_dir, "configure"), 'wb') as f:
            f.write(config_script)

    # build libxslt
    libxslt_configure_cmd = configure_cmd + [
        '--without-python',
        '--with-libxml-prefix=%s' % prefix,
        '--without-crypto',
    ]
    # Both libraries come from the libxslt source tree, so one build covers both.
    if not (has_current_lib("libxslt", libxslt_dir) and has_current_lib("libexslt", libxslt_dir)):
        cmmi(libxslt_configure_cmd, libxslt_dir, multicore, **call_setup)

    # collect build setup for lxml
    xslt_config = os.path.join(prefix, 'bin', 'xslt-config')
    xml2_config = os.path.join(prefix, 'bin', 'xml2-config')

    static_include_dirs.extend([
            os.path.join(prefix, 'include'),
            os.path.join(prefix, 'include', 'libxml2'),
            os.path.join(prefix, 'include', 'libxslt'),
            os.path.join(prefix, 'include', 'libexslt')])
    static_library_dirs.append(lib_dir)

    # Re-read the library directory: it now contains the freshly built files.
    listdir = os.listdir(lib_dir)
    static_binaries += [os.path.join(lib_dir, filename)
        for lib in lib_names
        for filename in listdir
        if lib in filename and filename.endswith('.a')]

    return xml2_config, xslt_config
def main():
    """Script entry point: fetch or build the libraries for sys_platform.

    On Windows platforms the pre-built binaries are downloaded into "libs".
    Everywhere else the libraries are downloaded into "libs" and built
    from source below "build/tmp".  Returns whatever the selected function
    returns.
    """
    static_include_dirs = []
    static_library_dirs = []
    download_dir = "libs"

    if not sys_platform.startswith('win'):
        return build_libxml2xslt(
            download_dir, 'build/tmp',
            static_include_dirs, static_library_dirs,
            static_cflags=[],
            static_binaries=[]
        )
    return get_prebuilt_libxml2xslt(
        download_dir, static_include_dirs, static_library_dirs)
if __name__ == '__main__':
    # An optional first command line argument replaces the detected
    # platform (changes the global sys_platform setting).
    if sys.argv[1:]:
        sys_platform = sys.argv[1]
    main()