


import json
import os

import spacy
from tqdm.notebook import tqdm

# Load the training file, which holds one JSON object per line (JSON Lines).
# Every parsed record is collected in `dataset` for the later split step.
dataset = []
# The records contain Chinese text, so decode explicitly as UTF-8 instead of
# relying on the platform's default encoding.
with open(train_file, 'r', encoding='utf-8') as file:
    # readlines() is kept on purpose: handing tqdm a list lets it show a total.
    for line in tqdm(file.readlines()):
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        data = json.loads(line)
        dataset.append(data)

你可以按照 CLUENER 的格式准备训练数据,每条样本的格式如下(实体位置为字符下标,结束下标包含在实体内):

{'text': '胡建新经营着位于深圳市福田区华富街道田面社区深南中路4028号田面城市大厦19B-19C的公司。',
 'label': {'person': {'胡建新': [[0, 2]]},
  'address': {'深圳市福田区华富街道田面社区深南中路4028号田面城市大厦19B-19C': [[8, 43]]}}}


import random
import numpy as np

def split_train_test_valid(dataset, train_size=0.8, test_size=0.1):
    """Randomly partition ``dataset`` into train, test and validation parts.

    Args:
        dataset: Sequence of samples; it is converted to a NumPy array.
        train_size: Fraction of the samples assigned to the training part.
        test_size: Fraction of the samples assigned to the test part.

    Returns:
        A ``(train, test, valid)`` tuple of NumPy arrays. The validation part
        receives every sample not taken by the first two parts.
    """
    samples = np.array(dataset)
    n_samples = len(samples)

    # Turn the fractions into slice boundaries (truncated towards zero).
    train_end = int(n_samples * train_size)
    test_end = train_end + int(n_samples * test_size)

    # Shuffle the positions rather than the samples themselves.
    order = list(range(n_samples))
    random.shuffle(order)

    train_idx = order[:train_end]
    test_idx = order[train_end:test_end]
    valid_idx = order[test_end:]  # everything left over
    return samples[train_idx], samples[test_idx], samples[valid_idx]

# Shuffle and split the loaded records with the default ratios (80% train,
# 10% test); the validation split receives whatever remains.
data_train, data_test, data_valid = split_train_test_valid(dataset)

将数据转换成 spaCy 的 DocBin 格式:

from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans

def to_docbin(dataset):
    """Convert CLUENER-style records into a spaCy ``DocBin``.

    Each record is read as
    ``{"text": str, "label": {label: {entity_text: [[start, end], ...]}}}``,
    where ``start`` and ``end`` are character offsets and ``end`` is
    inclusive.

    Args:
        dataset: Iterable of CLUENER-style records.

    Returns:
        A ``DocBin`` containing one ``Doc`` per record, with ``doc.ents`` set
        to the non-overlapping entity spans that align with token boundaries.

    Raises:
        KeyError: If a record lacks the ``"text"`` or ``"label"`` key.
    """
    nlp = spacy.blank('zh')   # blank Chinese pipeline; used only to tokenize
    doc_bin = DocBin()
    for record in tqdm(dataset):
        doc = nlp.make_doc(record['text'])
        ents = []
        for label, entities in record['label'].items():
            for locations in entities.values():
                for loc in locations:
                    # The end offset is inclusive in the data, while
                    # char_span expects an exclusive end, hence the +1.
                    span = doc.char_span(
                        loc[0], loc[1] + 1, label=label, alignment_mode="contract"
                    )
                    if span is None:
                        # The offsets do not map onto any token span.
                        print("Skipping entity")
                    else:
                        ents.append(span)
        # doc.ents must not contain overlapping spans; filter_spans resolves
        # overlaps before assignment.
        doc.ents = filter_spans(ents)
        doc_bin.add(doc)

    return doc_bin

# Convert the training and validation splits into spaCy's binary format.
doc_bin_train = to_docbin(data_train)
doc_bin_valid = to_docbin(data_valid)

# Write both DocBins to disk so that `spacy train` can read them.
doc_bin_train.to_disk("./train.spacy")
doc_bin_valid.to_disk("./valid.spacy")

训练集和验证集分别保存到了 train.spacy 和 valid.spacy。


在 spaCy 官网训练文档的 Quickstart 配置工具中选择 Chinese / ner / GPU,自动生成配置文件 base_config.cfg,然后用下面的命令补全为完整配置 config.cfg:


python -m spacy init fill-config base_config.cfg config.cfg


python -m spacy train config.cfg --output . --paths.train ./train.spacy --paths.dev ./valid.spacy --gpu-id 0


 python -m spacy train config.cfg --output . --paths.train ./train.spacy --paths.dev ./dev.spacy --gpu-id 0
ℹ Saving to output directory: .
ℹ Using GPU: 0

=========================== Initializing pipeline ===========================
Some weights of the model checkpoint at ../models/bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
✔ Initialized pipeline

============================= Training pipeline =============================
ℹ Pipeline: ['transformer', 'ner']
ℹ Initial learn rate: 0.0
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE
---  ------  -------------  --------  ------  ------  ------  ------
  0       0        2414.47    804.03    0.41    0.25    1.17    0.00
  0     200      553440.62  100815.50   25.73   27.65   24.06    0.26
  1     400      379529.80  55305.57   36.83   43.31   32.03    0.37
  2     600      164609.24  36629.69   62.07   60.54   63.67    0.62
  3     800      163662.29  38876.53   32.75   42.38   26.69    0.33
  4    1000       81601.30  28677.56   62.02   63.22   60.87    0.62
  5    1200       75558.20  26489.57   61.61   63.17   60.12    0.62
  6    1400       87824.25  25230.27   69.77   69.59   69.95    0.70
  6    1600       54173.95  21436.94   70.03   69.52   70.54    0.70
  7    1800       30978.67  15641.39   71.80   72.03   71.58    0.72
  8    2000       27723.05  13770.74   69.07   69.53   68.62    0.69
  9    2200       25622.08  12936.05   72.89   71.89   73.93    0.73
 10    2400       24126.19  13338.83   71.58   71.96   71.19    0.72
 11    2600       21804.75  11238.43   74.20   74.82   73.60    0.74
 12    2800       20628.26  10916.07   71.44   71.39   71.48    0.71
 13    3000       20134.37  11081.41   72.51   72.17   72.85    0.73
 14    3200       16227.69   8933.84   74.17   73.84   74.51    0.74
 14    3400       19235.74   9438.10   72.00   73.18   70.87    0.72
 15    3600       29307.03  12692.90   74.84   76.13   73.60    0.75
 16    3800       18102.06   8969.09   73.38   71.82   75.00    0.73
 17    4000       14903.23   8416.16   73.11   71.91   74.35    0.73
 18    4200       19608.45   9377.10   72.91   72.67   73.14    0.73
 19    4400       17153.18   8931.95   74.35   74.20   74.51    0.74
 20    4600       17934.71   9112.66   66.37   67.00   65.76    0.66
 20    4800       13376.17   7252.01   74.06   74.29   73.83    0.74
 21    5000       13659.26   6804.46   72.38   71.47   73.31    0.72
 22    5200       18188.32   8358.28   73.57   72.22   74.97    0.74
✔ Saved pipeline to output directory

验证集 F1 score 达到了 0.75。相比非 transformer 模型(tok2vec)的 0.65(训练日志如下),结果有明显提升:

ℹ Saving to output directory: .
ℹ Using GPU: 0

=========================== Initializing pipeline ===========================
✔ Initialized pipeline

============================= Training pipeline =============================
ℹ Pipeline: ['tok2vec', 'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     49.29    0.09    0.15    0.07    0.00
  0     200        496.94   3348.46    5.82    4.36    8.76    0.06
  0     400       1408.31   4107.52    9.38   20.41    6.09    0.09
  0     600       2121.99   5357.34   17.45   23.00   14.06    0.17
  0     800       1096.04   5009.92   19.90   27.89   15.46    0.20
  0    1000        931.30   5447.63   27.72   33.77   23.50    0.28
  0    1200       1375.05   6551.97   32.09   38.83   27.34    0.32
  0    1400       1388.81   7116.59   37.61   43.81   32.94    0.38
  0    1600       2521.46   9638.09   42.25   52.07   35.55    0.42
  1    1800       2172.77  10659.31   40.53   48.04   35.06    0.41
  1    2000       3563.99  12454.60   43.00   49.98   37.73    0.43
  1    2200       4926.80  15747.33   46.38   50.38   42.97    0.46
  2    2400       4712.95  18150.01   48.91   53.97   44.73    0.49
  2    2600       4945.91  18023.03   50.25   53.30   47.53    0.50
  3    2800       6100.79  18400.07   51.21   54.85   48.01    0.51
  3    3000       5124.39  17074.50   51.38   54.62   48.50    0.51
  4    3200       5595.23  17486.11   52.83   57.31   48.99    0.53
  4    3400       5857.02  16183.54   52.39   55.95   49.25    0.52
  5    3600       7097.00  16779.79   55.20   58.97   51.89    0.55
  5    3800       7305.36  16330.97   53.70   56.30   51.33    0.54
  6    4000       6912.16  15848.24   55.86   57.40   54.39    0.56
  6    4200       7083.29  15591.03   54.72   57.02   52.60    0.55
  7    4400       7072.32  14623.82   55.80   61.07   51.37    0.56
  7    4600       9153.78  15341.62   57.24   58.95   55.63    0.57
  8    4800       7584.10  14801.21   54.85   56.26   53.52    0.55
  8    5000       7514.11  14013.45   58.38   61.83   55.31    0.58
  9    5200       9505.86  14416.66   57.41   60.38   54.72    0.57
  9    5400       8458.73  13544.08   58.90   62.29   55.86    0.59
 10    5600       9179.71  12723.23   58.53   60.97   56.28    0.59
 10    5800       9730.11  13078.69   58.85   62.58   55.53    0.59
 11    6000       8485.15  13275.12   59.14   62.02   56.51    0.59
 11    6200      10376.37  12896.16   58.77   60.26   57.36    0.59
 12    6400       8562.07  12582.15   58.59   62.72   54.98    0.59
 12    6600       8131.18  11650.52   59.21   62.55   56.22    0.59
 13    6800      10618.73  11832.74   58.46   60.77   56.32    0.58
 13    7000      10180.18  12106.64   59.16   61.23   57.23    0.59
 14    7200      10455.71  11767.56   62.46   65.60   59.60    0.62
 14    7400      10277.93  11417.25   61.00   61.90   60.12    0.61
 15    7600      10416.83  11844.74   61.50   63.19   59.90    0.61
 15    7800       9843.24  10815.69   60.73   63.61   58.11    0.61
 16    8000      10849.20  11080.88   62.16   65.61   59.05    0.62
 16    8200      12479.84  10464.58   60.54   63.07   58.20    0.61
 16    8400      11960.47  10947.46   63.05   64.79   61.39    0.63
 17    8600      12225.40  10741.32   63.00   64.06   61.98    0.63
 17    8800      11885.81  10653.15   63.88   66.43   61.52    0.64
 18    9000       9813.91   9519.76   62.38   65.15   59.83    0.62
 18    9200      11317.17  10009.74   62.36   65.20   59.77    0.62
 19    9400      11061.72  10646.52   62.66   63.56   61.78    0.63
 19    9600      11708.71   9658.76   62.61   66.30   59.31    0.63
 20    9800      11545.23  10812.54   64.21   65.83   62.66    0.64
 20   10000      12078.46   9654.99   63.09   64.35   61.88    0.63
 21   10200      11745.36   9246.17   61.87   64.31   59.60    0.62
 21   10400      11913.01   9916.31   62.74   64.24   61.30    0.63
 22   10600      11860.46   9340.68   64.30   66.44   62.30    0.64
 22   10800      13450.33   9669.23   63.20   64.48   61.98    0.63
 23   11000      13385.45   9062.81   63.31   65.10   61.62    0.63
 23   11200      13600.88   9135.41   63.88   65.94   61.95    0.64
 24   11400      14294.13   8782.87   63.87   65.69   62.14    0.64
 24   11600      18930.36   9024.00   63.06   64.11   62.04    0.63
 25   11800      14705.22   8806.56   63.40   66.38   60.68    0.63
 25   12000      17361.70   8958.72   64.71   66.28   63.22    0.65
 26   12200      14182.36   8224.55   64.20   66.21   62.30    0.64
 26   12400      15606.35   8725.44   64.23   66.68   61.95    0.64
 27   12600      11960.69   7855.59   64.27   64.61   63.93    0.64
 27   12800      12869.61   8011.05   63.80   66.58   61.23    0.64
 28   13000      13938.21   8064.88   64.14   65.55   62.79    0.64
 28   13200      12936.39   8126.91   65.23   66.64   63.87    0.65
 29   13400      11387.84   7295.93   64.38   64.87   63.90    0.64
 29   13600      15525.57   8512.57   64.52   66.23   62.89    0.65
 30   13800      13474.02   8028.01   65.55   67.37   63.83    0.66
 30   14000      16685.29   7827.30   64.15   64.61   63.70    0.64
 31   14200      15312.08   7759.34   65.53   66.29   64.78    0.66
 31   14400      16065.35   7711.75   64.03   65.93   62.24    0.64
 32   14600      16316.15   7407.74   65.02   66.08   64.00    0.65
 32   14800      16318.76   7667.86   64.97   66.60   63.41    0.65
 33   15000      14086.54   7523.11   64.96   68.17   62.04    0.65
 33   15200      16476.11   7485.34   64.86   67.14   62.73    0.65
 34   15400      16635.40   7954.74   64.90   66.50   63.38    0.65
✔ Saved pipeline to output directory





