c++去掉汉语文本中的某些不明空白符

本人对汉语文本进行分词,由于中间夹杂了某些不明的空白符,无法按照去掉空格,换行回车这些方式去去除,这样分离出的有些汉语词语前面总是留有空白。注:处理的文本为ANSI编码方式,求大侠帮忙……

作者: ccnunlp   发布时间: 2011-06-16

这是分词后一个行显示一个词的结果:
版权




通知
    国家
    
      关于
图书
合同
           
著作权
实施
试行
  “国家”前面就有不明的空白

作者: ccnunlp   发布时间: 2011-06-16

C/C++ code
#include <windows.h>
#include <stdio.h>

int main() {
    const DWORD uWidth = 18 + 17 * 256, uHeight = 18 + 17 * 128;

    PBITMAPINFO pbmi = (PBITMAPINFO) LocalAlloc (LPTR, sizeof (BITMAPINFOHEADER) + sizeof (RGBQUAD) * 2);
    pbmi->bmiHeader.biSize = sizeof (BITMAPINFOHEADER);
    pbmi->bmiHeader.biWidth = uWidth;
    pbmi->bmiHeader.biHeight = uHeight;
    pbmi->bmiHeader.biPlanes = 1;
    pbmi->bmiHeader.biBitCount = 1;
    pbmi->bmiHeader.biSizeImage = ((uWidth + 31) & ~31) / 8 * uHeight;
    pbmi->bmiColors[0].rgbBlue = 0;
    pbmi->bmiColors[0].rgbGreen = 0;
    pbmi->bmiColors[0].rgbRed = 0;
    pbmi->bmiColors[1].rgbBlue = 255;
    pbmi->bmiColors[1].rgbGreen = 255;
    pbmi->bmiColors[1].rgbRed = 255;

    HDC hDC = CreateCompatibleDC (0);
    void * pvBits;
    HBITMAP hBitmap = CreateDIBSection (hDC, pbmi, 0, &pvBits, NULL, 0);
    SelectObject (hDC, hBitmap);
    HFONT hFont = CreateFont (16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "宋体");
    SelectObject (hDC, hFont);
    BitBlt (hDC, 0, 0, uWidth, uHeight, NULL, 0, 0, WHITENESS);

    char c[4];
    int i, j;
    for (i = 128; i < 256; i++) {
        sprintf (c, "%02X", i);
        TextOut (hDC, 1, (i - 127) * 17 + 1, c, 2);
    }
    for (j = 0; j < 256; j++) {
        sprintf (c, "%02X", j);
        TextOut (hDC, (j + 1)* 17 + 1, 1, c, 2);
    }
    for (i = 128; i < 256; i++) {
        for (j = 0; j < 256; j++) {
            c[0] = (char) i;
            c[1] = (char) j;
            TextOut (hDC, (j + 1) * 17 + 1, (i - 127) * 17 + 1, c, 2);
        }
    }
    for (i = 0; i < 130; i++) {
        MoveToEx (hDC, 0, i * 17, NULL);
        LineTo (hDC, uWidth, i * 17);
    }
    for (j = 0; j < 258; j++) {
        MoveToEx (hDC, j * 17, 0, NULL);
        LineTo (hDC, j * 17, uHeight);
    }

    BITMAPFILEHEADER bmfh;
    bmfh.bfType = *(PWORD) "BM";
    bmfh.bfReserved1 = 0;
    bmfh.bfReserved2 = 0;
    bmfh.bfOffBits = sizeof (BITMAPFILEHEADER) + sizeof (BITMAPINFOHEADER) + sizeof (RGBQUAD) * 2;
    bmfh.bfSize = bmfh.bfOffBits + pbmi->bmiHeader.biSizeImage;

    HANDLE hFile = CreateFile ("goal.bmp", GENERIC_WRITE, 0, 0, CREATE_ALWAYS, 0, 0);
    if (hFile != INVALID_HANDLE_VALUE) {
        DWORD dwWritten;
        WriteFile (hFile, &bmfh, sizeof (BITMAPFILEHEADER), &dwWritten, NULL);
        WriteFile (hFile, pbmi, sizeof (BITMAPINFOHEADER) + sizeof (RGBQUAD) * 2, &dwWritten, NULL);
        WriteFile (hFile, pvBits, pbmi->bmiHeader.biSizeImage, &dwWritten, NULL);

        CloseHandle (hFile);
    }

    DeleteObject (hFont);
    DeleteObject (hBitmap);
    DeleteDC (hDC);
    LocalFree (pbmi);

    return 0;
}





作者: zhao4zhong1   发布时间: 2011-06-16