S _ Label Studio - dwisianto/dwisianto GitHub Wiki

Quick Starts

  • Label Studio Server uses port 8080
    • netstat -nlp | grep 8080
    • lsof -i -P | grep -i "listen"
  • Start the Label Studio Server
    • label-studio &
    • The data directory is located at ~/.local/share/label-studio/
      • sqlite3 label_studio.sqlite3
  • Disable Signup Page
    • export LABEL_STUDIO_DISABLE_SIGNUP_WITHOUT_LINK=true
    • label-studio start --username user@example.com --password myPass
  • label-studio --data-dir studio-directory
    • export L_S_PASS=my-pass
    • label-studio --data-dir my-dir --username my_name --password $L_S_PASS
  • projects
    • label-studio init mine
    • label-studio start mine
  • label-studio -db
  • label-studio -p PORT
  • label-studio -h
  • label-studio-ml

Bash os

# Paths for the Label Studio setup. LS_CONF points at Label Studio's
# per-user data directory; the lower-case names mirror the upper-case ones.
LS_HOME=~/d2/s2/w2-/wk-/lstudio22/ls22a
LS_CONF=$HOME/.local/share/label-studio
ls_home=$LS_HOME
ls_conf=$LS_CONF
ls_work=~/d/s/w/
# Export everything in one go so child processes see all five variables.
export LS_HOME LS_CONF ls_home ls_conf ls_work

Anaconda Env

- conda create -y --name myEnv --clone base
  - conda create -y --name ls22py39 python==3.8
  - conda install virtualenv
  - virtualenv ls22py38a
  - source ls22py38a/bin/activate
  - pip install label-studio
  - postgres
    - https://www.moncefbelyamani.com/how-to-install-postgresql-on-a-mac-with-homebrew-and-lunchy/
    - https://stackoverflow.com/questions/20170895/mac-virtualenv-pip-postgresql-error-pg-config-executable-not-found
    - brew install postgres
    - emacs -nw ~/.zshrc
      - export PATH=$PATH:/Applications/Postgres.app/Contents/Versions/<current_version>/bin    
      - export PATH=$PATH:/opt/homebrew/opt/postgres/bin
echo "label-studio --data-dir ./my-dir --username admin@example.com --password admin"

Script

# Launcher configuration read by act(): executable, port, accounts and
# data directories.
#ANA_ENV=$(which label-studio)
#LS_BIN=${ANA_ENV}/bin/label-studio
# Resolve the label-studio executable on PATH. `command -v` is the portable
# replacement for `which`; LS_BIN stays empty when nothing is found.
LS_BIN=$(command -v label-studio)
LS_HOST=localhost
LS_PORT=8090

# Shared password plus one user name / data directory per preset
# (LS1_* is used by `act a`, LS2_* by `act b`).
LS0_USER_PASS=abcde
# NOTE(review): the original addresses were lost -- these two lines had been
# garbled into "[email protected]", which is not a valid assignment and left
# both variables unset. The values below are placeholders; replace them with
# the real account e-mails.
LS1_USER_NAME=user1@example.com
LS2_USER_NAME=user2@example.com
LS0_DAT_DIR=~/.local/share/label-studio
LS1_DAT_DIR="${LS0_DAT_DIR}-2022-08-14"
LS2_DAT_DIR="${LS0_DAT_DIR}-2022-08-15"


# Disable Signup Page
export LABEL_STUDIO_DISABLE_SIGNUP_WITHOUT_LINK=true
# label-studio start --username <email> --password <myPass>


# Options appended to every `label-studio start` command line.
LS_ARG="--port ${LS_PORT} --password ${LS0_USER_PASS}"
#######################################
# Print (without executing) the label-studio command line for a preset.
# Globals:   LS_BIN, LS_ARG, LS1_DAT_DIR, LS1_USER_NAME,
#            LS2_DAT_DIR, LS2_USER_NAME (all read)
# Arguments: a - start command for preset 1 (signup disabled)
#            b - start command for preset 2
#            h - help command
# Outputs:   the command line on stdout; a usage line on stderr otherwise
# Returns:   0 for a known preset, 2 for anything else
#######################################
act() {

    case "$*" in
        a )
            # Everything stays inside one quoted string so LS_ARG is printed
            # verbatim instead of being word-split and glob-expanded.
            printf '%s\n' "export LABEL_STUDIO_DISABLE_SIGNUP_WITHOUT_LINK=true; ${LS_BIN} start --data-dir ${LS1_DAT_DIR} --username ${LS1_USER_NAME} ${LS_ARG}"
            ;;
        b )
            printf '%s\n' "${LS_BIN} start --data-dir ${LS2_DAT_DIR} --username ${LS2_USER_NAME} ${LS_ARG}"
            ;;
        h )
            printf '%s\n' "${LS_BIN} help"
            ;;
        * )
            # Unknown or missing preset: say so instead of silently doing nothing.
            printf 'usage: act {a|b|h}\n' >&2
            return 2
            ;;
    esac
}

# "$@" forwards the script arguments unchanged (no re-splitting or globbing).
act "$@"

Spacy NER

import spacy
import pandas as pd
import json
from itertools import groupby

# Load the spaCy pipelines to compare, keyed by package name.
# spacy.load() only loads: both packages must already be installed
# (e.g. `python -m spacy download en_core_web_sm`).
models = {
    name: spacy.load(name)
    for name in ('en_core_web_sm', 'en_core_web_lg')
}

# This function converts spaCy docs to the list of named entity spans in Label Studio compatible JSON format:
def doc_to_spans(doc):
    """Turn the named entities of a spaCy doc into Label Studio span results.

    Entities are read from ``doc.ents`` rather than rebuilt from tokens, so:
    - two adjacent entities with the same label stay separate spans, and
    - ``text`` is the entity's own text, i.e. the substring the
      ``start``/``end`` character offsets point at (joining tokens with
      spaces can produce text that does not occur in the document).

    Args:
        doc: a processed spaCy ``Doc`` (anything exposing ``ents`` whose items
            have ``start_char``, ``end_char``, ``text`` and ``label_``).

    Returns:
        A ``(results, entities)`` tuple: ``results`` is a list of dicts with
        ``from_name``/``to_name``/``type``/``value`` keys, one per entity in
        document order; ``entities`` is the set of labels that occurred.
    """
    results = []
    entities = set()
    for ent in doc.ents:
        label = ent.label_
        if not label:
            # Unlabelled spans carry no class to report.
            continue
        results.append({
            'from_name': 'label',
            'to_name': 'text',
            'type': 'labels',
            'value': {
                'start': ent.start_char,
                'end': ent.end_char,
                'text': ent.text,
                'labels': [label]
            }
        })
        entities.add(label)

    return results, entities

# Read the dataset and keep only the rows whose text contains "Easter "
# (rows with a missing line_text are dropped by na=False):
df = pd.read_csv('lines_clean.csv')
easter_rows = df['line_text'].str.contains("Easter ", na=False)
df = df[easter_rows]
print(df.head())
texts = df['line_text']

# Build one Label Studio import task per text, carrying one prediction per
# model, and collect every entity label seen along the way:
entities = set()
tasks = []
for text in texts:
    predictions = []
    for model_name, nlp in models.items():
        doc = nlp(text)
        spans, ents = doc_to_spans(doc)
        entities.update(ents)
        predictions.append(dict(model_version=model_name, result=spans))
    tasks.append(dict(data=dict(text=text), predictions=predictions))

# Write the tasks out as tasks.json
print(f'Save {len(tasks)} tasks to "tasks.json"')
with open('tasks.json', mode='w') as f:
    f.write(json.dumps(tasks, indent=2))

# Write the sorted class labels, one per line
print('Named entities are saved to "named_entities.txt"')
with open('named_entities.txt', mode='w') as f:
    f.write('\n'.join(sorted(entities)))

ToDo

import json
from collections import defaultdict

# Score each model against the annotations: a prediction counts as a hit only
# when its result list equals the task's first annotation result exactly.
with open('annotations.json') as f:  # context manager closes the file handle
    tasks = json.load(f)
model_hits = defaultdict(int)

for task in tasks:
    annotation_result = task['annotations'][0]['result']
    for r in annotation_result:
        # Remove 'id' so it takes no part in the equality check below;
        # the default keeps results without an 'id' from raising KeyError.
        r.pop('id', None)
    for prediction in task['predictions']:
        model_hits[prediction['model_version']] += int(prediction['result'] == annotation_result)

num_task = len(tasks)
for model_name, num_hits in model_hits.items():
    acc = num_hits / num_task
    # acc is a fraction in [0, 1]; the '%' format type scales it by 100 so the
    # printed percent sign is correct (0.85 -> "85.00%", not "0.85%").
    print(f'Accuracy for {model_name}: {acc:.2%}')

References

  • datasaur.ai
    • label-studio vs datasaur.ai
⚠️ **GitHub.com Fallback** ⚠️