Scipyでの疎行列の扱い

2013-09-25

Python で疎な文書ベクトルの類似度や距離を計算するメモ

SciPy で疎行列を使う際の基本的な操作について。書いたのが昔なので、どこか間違っているかもしれません

行列の計算

import scipy.sparse as sp
import numpy as np

# Build two 1 x 10000 sparse row vectors in LIL format, which allows
# element-by-element assignment.
a = sp.lil_matrix((1, 10000))
b = sp.lil_matrix((1, 10000))
# a.shape => (1, 10000)

# Fill both vectors the same way: each position keeps its random draw only
# when the draw is >= 0.9 (roughly 10% of positions) and stays zero
# otherwise. Zeros are never assigned, so only non-zero entries are stored.
for vec in (a, b):
    for i in range(vec.shape[1]):  # range works on both Python 2 and 3
        r = np.random.rand()
        if r >= 0.9:
            vec[0, i] = r
a
# => <1x10000 sparse matrix of type '<type 'numpy.float64'>'
#         with 947 stored elements in LInked List format>
# (the exact count differs from run to run because the values are random)

変換

# Convert the LIL matrix to CSR (Compressed Sparse Row) format.
ca = a.tocsr()
ca
# => <1x10000 sparse matrix of type '<type 'numpy.float64'>'
#         with 947 stored elements in Compressed Sparse Row format>
# lil => csr: the same stored elements, now in CSR format

行列積

# Transpose of a.
ta = a.T
# Matrix product of a and its transpose. The result is a (1, 1) matrix,
# and it is still represented as a sparse matrix.
print(a.dot(ta))  # parenthesised so it runs on both Python 2 and Python 3
# => (0, 0)        853.19504342

ベクトルの大きさ

import math  # math.sqrt is used below

v = np.array([[1, 1]])
# Norm as sqrt(v . v^T). The product is a 1 x 1 array, so extract the
# scalar explicitly before passing it to math.sqrt.
math.sqrt(np.dot(v, v.T)[0, 0])
# => 1.4142135623730951
np.linalg.norm(v)
# => 1.4142135623730951

# Passing the sparse matrix straight to np.linalg.norm raises an error, so
# the call is left commented out to let the rest of the snippet run:
# np.linalg.norm(a)
# Convert to a dense matrix / array first
# (scipy.sparse.linalg.norm is the alternative that avoids densifying).
np.linalg.norm(a.todense())
np.linalg.norm(a.toarray())
# => 29.209502621916037

# コサイン類似度(`dis.cosine` が返すのはコサイン距離 = 1 − コサイン類似度)
import scipy.spatial.distance as dis

# dis.cosine returns the cosine *distance* (1 - cosine similarity) and
# works on 1-D vectors, so flatten the 1 x N matrices before calling it.
cos_dist = dis.cosine(a.toarray().ravel(), b.toarray().ravel())
cos_dist
# => 0.91347774109309299
# The cosine similarity is the complement of the distance.
1.0 - cos_dist
# => about 0.0865 for the distance shown above

疎行列のユークリッド距離

# -*- encoding: utf-8 -*-

import scipy.spatial.distance as dis
import scipy.sparse as sp
import numpy as np, scipy.io as io
import math

def sparse_distance(v1, v2):
    """Return the Euclidean distance between two 1 x N vectors.

    Args:
        v1, v2: 1 x N vectors, given either as scipy sparse matrices (any
            storage format) or as dense array-likes.

    Returns:
        The Euclidean distance between v1 and v2 as a float.

    Raises:
        ValueError: if the two vectors do not have the same size/shape.
    """
    if not sp.issparse(v1) or not sp.issparse(v2):
        # At least one operand is dense: densify any sparse operand and use
        # scipy's euclidean. ``.size`` of a sparse matrix is its number of
        # stored elements, so sizes are compared on the dense arrays.
        dense1 = v1.toarray() if sp.issparse(v1) else np.asarray(v1)
        dense2 = v2.toarray() if sp.issparse(v2) else np.asarray(v2)
        if dense1.size != dense2.size:
            raise ValueError("vectors must have the same number of elements")
        # euclidean works on 1-D vectors, so flatten the 1 x N inputs.
        return dis.euclidean(dense1.ravel(), dense2.ravel())
    if v1.shape != v2.shape:
        raise ValueError("vectors must have the same shape")
    # Subtract in sparse form. Each position appears once in the
    # difference, so an index that is non-zero in both vectors is not
    # counted twice.
    diff = (v1 - v2).tocsr()
    return math.sqrt(diff.multiply(diff).sum())

from: https://qiita.com/petitviolet/items/545f49e3da2d50b95ed7