LDA

Parameters
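
CuLDA is configured with a plain Python dict of options. The keys sketched below are the ones that appear in the training example later in this section; the short descriptions are inferred from the key names and should be treated as assumptions rather than authoritative documentation, and the list is not exhaustive.

opt = {
  "data_path": "res/docword.nytimes.txt",           # UCI bag-of-words input file
  "processed_data_path": "res/docword.nytimes.h5",  # where preprocessed data is cached
  "keys_path": "res/vocab.nytimes.txt",             # vocabulary file
  "num_topics": 50,                                 # number of LDA topics
  "num_iters_in_e_step": 10,                        # iterations of the variational E-step
  "reuse_gamma": True,                              # reuse document-topic gamma across EM iterations
  # "skip_preprocess": True,                        # skip preprocessing when the .h5 cache exists
}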

Example Codes

pip install -r examples/requirements.txt
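
If the requirements file is not at hand, the snippets in this section only rely on a few packages; installing them directly should also work (this list is an assumption based on the imports used below, not the official installation instructions):

pip install cusim wget h5py numpy
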
  • Download and preprocess data

import os
from os.path import join as pjoin
import subprocess

import wget

DATASET = "nytimes"
DIR_PATH = "./res"
os.makedirs(DIR_PATH, exist_ok=True)  # make sure the download directory exists
BASE_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/" \
          "bag-of-words/"


# download docword
filename = f"docword.{DATASET}.txt.gz"
out_path = pjoin(DIR_PATH, filename)
wget.download(BASE_URL + filename, out=out_path)
print()

# decompress
cmd = ["gunzip", "-c", out_path, ">",
       pjoin(DIR_PATH, f"docword.{DATASET}.txt")]
cmd = " ".join(cmd)
subprocess.call(cmd, shell=True)
os.remove(pjoin(DIR_PATH, filename))

# download vocab
filename = f"vocab.{DATASET}.txt"
out_path = pjoin(DIR_PATH, filename)
wget.download(BASE_URL + filename, out=out_path)
print()
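
The decompression step above shells out to gunzip, which may not be available on every platform (e.g. Windows). As an alternative, the same step can be done with Python's standard gzip and shutil modules; a minimal sketch using the same paths as above:

import gzip
import shutil

# standard-library alternative to the `gunzip -c ... > ...` call above
gz_path = pjoin(DIR_PATH, f"docword.{DATASET}.txt.gz")
txt_path = pjoin(DIR_PATH, f"docword.{DATASET}.txt")
with gzip.open(gz_path, "rb") as fin, open(txt_path, "wb") as fout:
  shutil.copyfileobj(fin, fout)
os.remove(gz_path)
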
  • Train cusim LDA

import time

from cusim import CuLDA

data_path = pjoin(DIR_PATH, f"docword.{DATASET}.txt")
keys_path = pjoin(DIR_PATH, f"vocab.{DATASET}.txt")
processed_data_path = pjoin(DIR_PATH, f"docword.{DATASET}.h5")
opt = {
  "data_path": data_path,
  "processed_data_path": processed_data_path,
  "keys_path": keys_path,
  "num_topics": 50,
  "num_iters_in_e_step": 10,
  "reuse_gamma": True,
  # "skip_preprocess": os.path.exists(processed_data_path),
}
start = time.time()
lda = CuLDA(opt)
lda.train_model()
print(f"training time: {time.time() - start:.4f} sec")
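
Preprocessing the raw bag-of-words file into the .h5 file only needs to happen once. The commented-out skip_preprocess option above hints that later runs can reuse the cached file; a minimal sketch of a second run, assuming skip_preprocess behaves as its name implies:

# reuse the already-preprocessed .h5 data on subsequent runs (assumption:
# skip_preprocess skips the text-to-h5 conversion when the cache already exists)
opt = {
  "data_path": data_path,
  "processed_data_path": processed_data_path,
  "keys_path": keys_path,
  "num_topics": 50,
  "skip_preprocess": os.path.exists(processed_data_path),
}
lda = CuLDA(opt)
lda.train_model()
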
  • Save and evaluate model

import h5py
import numpy as np

h5_model_path = pjoin(DIR_PATH, "cusim.lda.model.h5")
lda.save_h5_model(h5_model_path)

h5f = h5py.File(h5_model_path, "r")
beta = h5f["beta"][:, :].T  # topic-word matrix, shape (num_topics, vocab_size)
keys = h5f["keys"][:]       # vocabulary terms (stored as bytes)
topk = 10

for idx in range(beta.shape[0]):
  print("=" * 50)
  print(f"topic {idx + 1}")
  print("-" * 50)
  _beta = beta[idx, :]
  indices = np.argsort(-_beta)[:topk]
  for rank, wordid in enumerate(indices):
    word = keys[wordid].decode("utf8")
    prob = _beta[wordid]
    print(f"rank {rank + 1}. {word}: {prob}")

Performance

| attr                | gensim (8 vcpus) | cusim (NVIDIA T4) |
|---------------------|------------------|-------------------|
| training time (sec) | 447.376          | 76.6972           |