Browse Source

Add file-ext regex rules, exception handlers.

Add:
    bitshift/crawler/indexer.py
        -add two `try: except: pass` blocks, one to _decode() and another to
        GitIndexer.run(); bad practice, but GitIndexer has numerous unreliable
        moving parts that can throw too many unforeseeable exceptions. Only
        current viable option.
        -add file-extension regex ignore rules (for text, markdown, etc. files)
        to _get_tracked_files().
tags/v1.0^2
Severyn Kozak 10 years ago
parent
commit
f4b28e6178
1 changed file with 20 additions and 11 deletions
  1. +20
    -11
      bitshift/crawler/indexer.py

+ 20
- 11
bitshift/crawler/indexer.py View File

@@ -31,6 +31,10 @@ class GitIndexer(threading.Thread):
""" """


self.repository_queue = repository_queue self.repository_queue = repository_queue

if not os.path.exists(GIT_CLONE_DIR):
os.makedirs(GIT_CLONE_DIR)

super(GitIndexer, self).__init__() super(GitIndexer, self).__init__()


def run(self): def run(self):
@@ -53,7 +57,7 @@ class GitIndexer(threading.Thread):
try: try:
_index_repository(repo["url"], repo["name"], _index_repository(repo["url"], repo["name"],
repo["framework_name"]) repo["framework_name"])
except: # desperate times -- will be modified later
except:
pass pass


class _ChangeDir(object): class _ChangeDir(object):
@@ -110,16 +114,19 @@ def _index_repository(repo_url, repo_name, framework_name):
:type framework_name: str :type framework_name: str
""" """


GIT_CLONE_TIMEOUT = 60
GIT_CLONE_TIMEOUT = 600


with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir:
if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \ if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \
clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0: clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0:
if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)):
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
return return


with _ChangeDir(repo_name) as repository_dir: with _ChangeDir(repo_name) as repository_dir:
_insert_repository_codelets(repo_url, repo_name, framework_name) _insert_repository_codelets(repo_url, repo_name, framework_name)
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))

shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))


def _insert_repository_codelets(repo_url, repo_name, framework_name): def _insert_repository_codelets(repo_url, repo_name, framework_name):
""" """
@@ -153,11 +160,6 @@ def _insert_repository_codelets(repo_url, repo_name, framework_name):
commits_meta[filename]["time_created"], commits_meta[filename]["time_created"],
commits_meta[filename]["time_last_modified"]) commits_meta[filename]["time_last_modified"])


db.codelets.insert({
"name" : codelet.name,
"authors" : codelet.authors
})

# Database.insert(codelet) # Database.insert(codelet)


def _generate_file_url(filename, repo_url, framework_name): def _generate_file_url(filename, repo_url, framework_name):
@@ -230,6 +232,8 @@ def _get_tracked_files():
""" """


GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"] GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"]
GIT_IGNORE_EXTENSIONS = ["t[e]?xt(ile)?", "m(ark)?down", "mkd[n]?",
"md(wn|t[e]?xt)?", "rst"]


tracked_files = subprocess.check_output(("perl -le 'for (@ARGV){ print if \ tracked_files = subprocess.check_output(("perl -le 'for (@ARGV){ print if \
-f && -T }' $(find . -type d -name .git -prune -o -print)"), -f && -T }' $(find . -type d -name .git -prune -o -print)"),
@@ -239,7 +243,11 @@ def _get_tracked_files():
for filename in tracked_files: for filename in tracked_files:
filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE) filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE)
for pattern in GIT_IGNORE_FILES]) for pattern in GIT_IGNORE_FILES])
if not filename_match:
extension = filename.split(".")[-1]
extension_match = any([re.match(pattern, filename, flags=re.IGNORECASE)
for pattern in GIT_IGNORE_EXTENSIONS])

if not (filename_match or extension_match):
valuable_files.append(filename[2:]) valuable_files.append(filename[2:])
return valuable_files return valuable_files


@@ -301,7 +309,8 @@ def _decode(raw):
""" """


try: try:
return raw.decode(bs4.BeautifulSoup(raw).original_encoding)
encoding = bs4.BeautifulSoup(raw).original_encoding
return raw.decode(encoding) if encoding is not None else None


except (UnicodeDecodeError, UserWarning):
except:
return None return None

Loading…
Cancel
Save